Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig470
-rw-r--r--arch/x86/Kconfig.cpu172
-rw-r--r--arch/x86/Kconfig.debug70
-rw-r--r--arch/x86/Makefile32
-rw-r--r--arch/x86/Makefile_32.cpu6
-rw-r--r--arch/x86/boot/Makefile5
-rw-r--r--arch/x86/boot/a20.c5
-rw-r--r--arch/x86/boot/boot.h10
-rw-r--r--arch/x86/boot/compressed/Makefile8
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c110
-rw-r--r--arch/x86/boot/compressed/relocs.c198
-rw-r--r--arch/x86/boot/cpu.c22
-rw-r--r--arch/x86/boot/cpucheck.c18
-rw-r--r--arch/x86/boot/edd.c12
-rw-r--r--arch/x86/boot/header.S1
-rw-r--r--arch/x86/boot/main.c9
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/boot/mkcpustr.c40
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/boot/pmjump.S4
-rw-r--r--arch/x86/boot/video-vesa.c2
-rw-r--r--arch/x86/boot/video-vga.c3
-rw-r--r--arch/x86/configs/i386_defconfig1779
-rw-r--r--arch/x86/configs/x86_64_defconfig1798
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/crc32c-intel.c197
-rw-r--r--arch/x86/ia32/ia32_aout.c17
-rw-r--r--arch/x86/ia32/ia32_signal.c149
-rw-r--r--arch/x86/ia32/ia32entry.S197
-rw-r--r--arch/x86/ia32/sys_ia32.c11
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile45
-rw-r--r--arch/x86/kernel/acpi/boot.c491
-rw-r--r--arch/x86/kernel/acpi/processor.c6
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S38
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/sleep.c40
-rw-r--r--arch/x86/kernel/alternative.c68
-rw-r--r--arch/x86/kernel/amd_iommu.c1383
-rw-r--r--arch/x86/kernel/amd_iommu_init.c1234
-rw-r--r--arch/x86/kernel/aperture_64.c320
-rw-r--r--arch/x86/kernel/apic_32.c699
-rw-r--r--arch/x86/kernel/apic_64.c725
-rw-r--r--arch/x86/kernel/apm_32.c40
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c20
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/Makefile28
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c122
-rw-r--r--arch/x86/kernel/cpu/amd.c581
-rw-r--r--arch/x86/kernel/cpu/bugs.c56
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c (renamed from arch/x86/kernel/bugs_64.c)0
-rw-r--r--arch/x86/kernel/cpu/centaur.c15
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/cmpxchg.c72
-rw-r--r--arch/x86/kernel/cpu/common.c1011
-rw-r--r--arch/x86/kernel/cpu/cpu.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c151
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c4
-rw-r--r--arch/x86/kernel/cpu/cyrix.c73
-rw-r--r--arch/x86/kernel/cpu/feature_names.c83
-rw-r--r--arch/x86/kernel/cpu/intel.c357
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c179
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c61
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c92
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl32
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c53
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1014
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h3
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c306
-rw-r--r--arch/x86/kernel/cpu/powerflags.c20
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c32
-rw-r--r--arch/x86/kernel/cpu/umc.c3
-rw-r--r--arch/x86/kernel/cpuid.c45
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/doublefault_32.c2
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/dumpstack_32.c447
-rw-r--r--arch/x86/kernel/dumpstack_64.c573
-rw-r--r--arch/x86/kernel/e820.c1391
-rw-r--r--arch/x86/kernel/e820_32.c775
-rw-r--r--arch/x86/kernel/e820_64.c952
-rw-r--r--arch/x86/kernel/early-quirks.c100
-rw-r--r--arch/x86/kernel/early_printk.c750
-rw-r--r--arch/x86/kernel/efi.c73
-rw-r--r--arch/x86/kernel/efi_32.c12
-rw-r--r--arch/x86/kernel/efi_64.c8
-rw-r--r--arch/x86/kernel/entry_32.S184
-rw-r--r--arch/x86/kernel/entry_64.S358
-rw-r--r--arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c)164
-rw-r--r--arch/x86/kernel/ftrace.c141
-rw-r--r--arch/x86/kernel/genapic_64.c87
-rw-r--r--arch/x86/kernel/genapic_flat_64.c64
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c159
-rw-r--r--arch/x86/kernel/genx2apic_phys.c154
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c330
-rw-r--r--arch/x86/kernel/head.c56
-rw-r--r--arch/x86/kernel/head32.c27
-rw-r--r--arch/x86/kernel/head64.c103
-rw-r--r--arch/x86/kernel/head_32.S49
-rw-r--r--arch/x86/kernel/head_64.S106
-rw-r--r--arch/x86/kernel/hpet.c122
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c9
-rw-r--r--arch/x86/kernel/i387.c168
-rw-r--r--arch/x86/kernel/i8259.c (renamed from arch/x86/kernel/i8259_32.c)160
-rw-r--r--arch/x86/kernel/i8259_64.c512
-rw-r--r--arch/x86/kernel/io_apic_32.c756
-rw-r--r--arch/x86/kernel/io_apic_64.c971
-rw-r--r--arch/x86/kernel/io_delay.c11
-rw-r--r--arch/x86/kernel/ioport.c1
-rw-r--r--arch/x86/kernel/ipi.c10
-rw-r--r--arch/x86/kernel/irq_32.c255
-rw-r--r--arch/x86/kernel/irq_64.c30
-rw-r--r--arch/x86/kernel/irqinit_32.c163
-rw-r--r--arch/x86/kernel/irqinit_64.c233
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c9
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kprobes.c7
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c22
-rw-r--r--arch/x86/kernel/machine_kexec_32.c63
-rw-r--r--arch/x86/kernel/machine_kexec_64.c8
-rw-r--r--arch/x86/kernel/mfgpt_32.c52
-rw-r--r--arch/x86/kernel/microcode.c848
-rw-r--r--arch/x86/kernel/microcode_amd.c435
-rw-r--r--arch/x86/kernel/microcode_core.c508
-rw-r--r--arch/x86/kernel/microcode_intel.c480
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c3
-rw-r--r--arch/x86/kernel/module_64.c11
-rw-r--r--arch/x86/kernel/mpparse.c888
-rw-r--r--arch/x86/kernel/msr.c60
-rw-r--r--arch/x86/kernel/nmi.c (renamed from arch/x86/kernel/nmi_64.c)256
-rw-r--r--arch/x86/kernel/nmi_32.c467
-rw-r--r--arch/x86/kernel/numaq_32.c222
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c49
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c9
-rw-r--r--arch/x86/kernel/pci-calgary_64.c192
-rw-r--r--arch/x86/kernel/pci-dma.c353
-rw-r--r--arch/x86/kernel/pci-gart_64.c262
-rw-r--r--arch/x86/kernel/pci-nommu.c20
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c6
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/probe_roms_32.c166
-rw-r--r--arch/x86/kernel/process.c234
-rw-r--r--arch/x86/kernel/process_32.c156
-rw-r--r--arch/x86/kernel/process_64.c338
-rw-r--r--arch/x86/kernel/ptrace.c657
-rw-r--r--arch/x86/kernel/quirks.c97
-rw-r--r--arch/x86/kernel/reboot.c35
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c4
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S178
-rw-r--r--arch/x86/kernel/rtc.c22
-rw-r--r--arch/x86/kernel/setup.c1134
-rw-r--r--arch/x86/kernel/setup64.c287
-rw-r--r--arch/x86/kernel/setup_32.c964
-rw-r--r--arch/x86/kernel/setup_64.c1194
-rw-r--r--arch/x86/kernel/setup_percpu.c385
-rw-r--r--arch/x86/kernel/sigframe.h19
-rw-r--r--arch/x86/kernel/signal_32.c280
-rw-r--r--arch/x86/kernel/signal_64.c313
-rw-r--r--arch/x86/kernel/smp.c164
-rw-r--r--arch/x86/kernel/smpboot.c547
-rw-r--r--arch/x86/kernel/smpcommon.c73
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/summit_32.c4
-rw-r--r--arch/x86/kernel/sys_i386_32.c66
-rw-r--r--arch/x86/kernel/sys_x86_64.c44
-rw-r--r--arch/x86/kernel/syscall_64.c4
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/time_32.c15
-rw-r--r--arch/x86/kernel/time_64.c39
-rw-r--r--arch/x86/kernel/tlb_32.c10
-rw-r--r--arch/x86/kernel/tlb_64.c7
-rw-r--r--arch/x86/kernel/tlb_uv.c793
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/trampoline.c2
-rw-r--r--arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c)988
-rw-r--r--arch/x86/kernel/traps_64.c1218
-rw-r--r--arch/x86/kernel/tsc.c849
-rw-r--r--arch/x86/kernel/tsc_32.c451
-rw-r--r--arch/x86/kernel/tsc_64.c357
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/visws_quirks.c691
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmi_32.c26
-rw-r--r--arch/x86/kernel/vmiclock_32.c7
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S32
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S25
-rw-r--r--arch/x86/kernel/vsmp_64.c5
-rw-r--r--arch/x86/kernel/vsyscall_64.c19
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c16
-rw-r--r--arch/x86/kernel/xsave.c345
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/i8254.c24
-rw-r--r--arch/x86/kvm/i8259.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c14
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c173
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h42
-rw-r--r--arch/x86/kvm/svm.c153
-rw-r--r--arch/x86/kvm/vmx.c253
-rw-r--r--arch/x86/kvm/vmx.h29
-rw-r--r--arch/x86/kvm/x86.c432
-rw-r--r--arch/x86/kvm/x86_emulate.c257
-rw-r--r--arch/x86/lguest/Kconfig2
-rw-r--r--arch/x86/lguest/boot.c53
-rw-r--r--arch/x86/lib/Makefile8
-rw-r--r--arch/x86/lib/copy_user_64.S429
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S282
-rw-r--r--arch/x86/lib/delay.c (renamed from arch/x86/lib/delay_32.c)38
-rw-r--r--arch/x86/lib/delay_64.c85
-rw-r--r--arch/x86/lib/getuser.S (renamed from arch/x86/lib/getuser_64.S)87
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/msr-on-cpu.c76
-rw-r--r--arch/x86/lib/putuser.S (renamed from arch/x86/lib/putuser_32.S)73
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/string_32.c42
-rw-r--r--arch/x86/lib/strstr_32.c6
-rw-r--r--arch/x86/lib/thunk_32.S47
-rw-r--r--arch/x86/lib/thunk_64.S19
-rw-r--r--arch/x86/lib/usercopy_32.c7
-rw-r--r--arch/x86/lib/usercopy_64.c23
-rw-r--r--arch/x86/mach-default/setup.c95
-rw-r--r--arch/x86/mach-es7000/Makefile6
-rw-r--r--arch/x86/mach-es7000/es7000.h114
-rw-r--r--arch/x86/mach-generic/Makefile9
-rw-r--r--arch/x86/mach-generic/bigsmp.c13
-rw-r--r--arch/x86/mach-generic/es7000.c33
-rw-r--r--arch/x86/mach-generic/numaq.c41
-rw-r--r--arch/x86/mach-generic/probe.c15
-rw-r--r--arch/x86/mach-generic/summit.c11
-rw-r--r--arch/x86/mach-rdc321x/platform.c1
-rw-r--r--arch/x86/mach-visws/Makefile8
-rw-r--r--arch/x86/mach-visws/mpparse.c88
-rw-r--r--arch/x86/mach-visws/reboot.c55
-rw-r--r--arch/x86/mach-visws/setup.c183
-rw-r--r--arch/x86/mach-visws/traps.c69
-rw-r--r--arch/x86/mach-visws/visws_apic.c297
-rw-r--r--arch/x86/mach-voyager/setup.c37
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c114
-rw-r--r--arch/x86/math-emu/reg_constant.c8
-rw-r--r--arch/x86/mm/Makefile17
-rw-r--r--arch/x86/mm/dump_pagetables.c12
-rw-r--r--arch/x86/mm/fault.c128
-rw-r--r--arch/x86/mm/gup.c298
-rw-r--r--arch/x86/mm/hugetlbpage.c78
-rw-r--r--arch/x86/mm/init_32.c600
-rw-r--r--arch/x86/mm/init_64.c822
-rw-r--r--arch/x86/mm/ioremap.c267
-rw-r--r--arch/x86/mm/k8topology_64.c21
-rw-r--r--arch/x86/mm/kmmio.c510
-rw-r--r--arch/x86/mm/memtest.c123
-rw-r--r--arch/x86/mm/mmio-mod.c517
-rw-r--r--arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)288
-rw-r--r--arch/x86/mm/numa_64.c109
-rw-r--r--arch/x86/mm/pageattr-test.c27
-rw-r--r--arch/x86/mm/pageattr.c541
-rw-r--r--arch/x86/mm/pat.c612
-rw-r--r--arch/x86/mm/pf_in.c489
-rw-r--r--arch/x86/mm/pf_in.h39
-rw-r--r--arch/x86/mm/pgtable.c199
-rw-r--r--arch/x86/mm/pgtable_32.c104
-rw-r--r--arch/x86/mm/srat_32.c (renamed from arch/x86/kernel/srat_32.c)235
-rw-r--r--arch/x86/mm/srat_64.c23
-rw-r--r--arch/x86/mm/testmmiotrace.c71
-rw-r--r--arch/x86/oprofile/Makefile2
-rw-r--r--arch/x86/oprofile/nmi_int.c117
-rw-r--r--arch/x86/oprofile/op_model_amd.c543
-rw-r--r--arch/x86/oprofile/op_model_athlon.c190
-rw-r--r--arch/x86/oprofile/op_model_p4.c175
-rw-r--r--arch/x86/oprofile/op_x86_model.h4
-rw-r--r--arch/x86/pci/Makefile22
-rw-r--r--arch/x86/pci/Makefile_3224
-rw-r--r--arch/x86/pci/Makefile_6417
-rw-r--r--arch/x86/pci/acpi.c26
-rw-r--r--arch/x86/pci/amd_bus.c (renamed from arch/x86/pci/k8-bus_64.c)150
-rw-r--r--arch/x86/pci/common.c42
-rw-r--r--arch/x86/pci/direct.c25
-rw-r--r--arch/x86/pci/early.c72
-rw-r--r--arch/x86/pci/fixup.c31
-rw-r--r--arch/x86/pci/i386.c35
-rw-r--r--arch/x86/pci/init.c4
-rw-r--r--arch/x86/pci/irq.c487
-rw-r--r--arch/x86/pci/legacy.c21
-rw-r--r--arch/x86/pci/mmconfig-shared.c81
-rw-r--r--arch/x86/pci/mp_bus_to_node.c23
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)40
-rw-r--r--arch/x86/pci/pci.h15
-rw-r--r--arch/x86/pci/visws.c21
-rw-r--r--arch/x86/power/cpu_32.c13
-rw-r--r--arch/x86/power/cpu_64.c7
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/power/hibernate_asm_32.S40
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vclock_gettime.c15
-rw-r--r--arch/x86/vdso/vdso32-setup.c30
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c13
-rw-r--r--arch/x86/xen/Kconfig28
-rw-r--r--arch/x86/xen/Makefile14
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c992
-rw-r--r--arch/x86/xen/irq.c143
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c840
-rw-r--r--arch/x86/xen/mmu.h42
-rw-r--r--arch/x86/xen/multicalls.c156
-rw-r--r--arch/x86/xen/multicalls.h12
-rw-r--r--arch/x86/xen/setup.c109
-rw-r--r--arch/x86/xen/smp.c342
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/suspend.c48
-rw-r--r--arch/x86/xen/time.c29
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)2
-rw-r--r--arch/x86/xen/xen-asm_64.S285
-rw-r--r--arch/x86/xen/xen-head.S31
-rw-r--r--arch/x86/xen/xen-ops.h43
345 files changed, 39533 insertions, 22095 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 09a28a36ff26..e73ddc382a16 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,13 +18,21 @@ config X86_64
18### Arch settings 18### Arch settings
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_AOUT if X86_32
21 select HAVE_UNSTABLE_SCHED_CLOCK 22 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 23 select HAVE_IDE
23 select HAVE_OPROFILE 24 select HAVE_OPROFILE
25 select HAVE_IOREMAP_PROT
24 select HAVE_KPROBES 26 select HAVE_KPROBES
27 select ARCH_WANT_OPTIONAL_GPIOLIB
25 select HAVE_KRETPROBES 28 select HAVE_KRETPROBES
29 select HAVE_DYNAMIC_FTRACE
30 select HAVE_FTRACE
26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 31 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
27 select HAVE_ARCH_KGDB if !X86_VOYAGER 32 select HAVE_ARCH_KGDB if !X86_VOYAGER
33 select HAVE_ARCH_TRACEHOOK
34 select HAVE_GENERIC_DMA_COHERENT if X86_32
35 select HAVE_EFFICIENT_UNALIGNED_ACCESS
28 36
29config ARCH_DEFCONFIG 37config ARCH_DEFCONFIG
30 string 38 string
@@ -121,7 +129,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
121 def_bool y 129 def_bool y
122 130
123config HAVE_SETUP_PER_CPU_AREA 131config HAVE_SETUP_PER_CPU_AREA
124 def_bool X86_64 || (X86_SMP && !X86_VOYAGER) 132 def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
125 133
126config HAVE_CPUMASK_OF_CPU_MAP 134config HAVE_CPUMASK_OF_CPU_MAP
127 def_bool X86_64_SMP 135 def_bool X86_64_SMP
@@ -145,9 +153,6 @@ config AUDIT_ARCH
145 bool 153 bool
146 default X86_64 154 default X86_64
147 155
148config ARCH_SUPPORTS_AOUT
149 def_bool y
150
151config ARCH_SUPPORTS_OPTIMIZED_INLINING 156config ARCH_SUPPORTS_OPTIMIZED_INLINING
152 def_bool y 157 def_bool y
153 158
@@ -168,6 +173,7 @@ config GENERIC_PENDING_IRQ
168config X86_SMP 173config X86_SMP
169 bool 174 bool
170 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) 175 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
176 select USE_GENERIC_SMP_HELPERS
171 default y 177 default y
172 178
173config X86_32_SMP 179config X86_32_SMP
@@ -181,12 +187,12 @@ config X86_64_SMP
181config X86_HT 187config X86_HT
182 bool 188 bool
183 depends on SMP 189 depends on SMP
184 depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64 190 depends on (X86_32 && !X86_VOYAGER) || X86_64
185 default y 191 default y
186 192
187config X86_BIOS_REBOOT 193config X86_BIOS_REBOOT
188 bool 194 bool
189 depends on !X86_VISWS && !X86_VOYAGER 195 depends on !X86_VOYAGER
190 default y 196 default y
191 197
192config X86_TRAMPOLINE 198config X86_TRAMPOLINE
@@ -230,6 +236,26 @@ config SMP
230 236
231 If you don't know what to do here, say N. 237 If you don't know what to do here, say N.
232 238
239config X86_FIND_SMP_CONFIG
240 def_bool y
241 depends on X86_MPPARSE || X86_VOYAGER
242
243if ACPI
244config X86_MPPARSE
245 def_bool y
246 bool "Enable MPS table"
247 depends on X86_LOCAL_APIC
248 help
249 For old SMP systems that do not have proper ACPI support. On newer systems
250 (especially with 64-bit CPUs) with ACPI support, the MADT and DSDT will override it.
251endif
252
253if !ACPI
254config X86_MPPARSE
255 def_bool y
256 depends on X86_LOCAL_APIC
257endif
258
233choice 259choice
234 prompt "Subarchitecture Type" 260 prompt "Subarchitecture Type"
235 default X86_PC 261 default X86_PC
@@ -251,7 +277,7 @@ config X86_ELAN
251 277
252config X86_VOYAGER 278config X86_VOYAGER
253 bool "Voyager (NCR)" 279 bool "Voyager (NCR)"
254 depends on X86_32 && (SMP || BROKEN) 280 depends on X86_32 && (SMP || BROKEN) && !PCI
255 help 281 help
256 Voyager is an MCA-based 32-way capable SMP architecture proprietary 282 Voyager is an MCA-based 32-way capable SMP architecture proprietary
257 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. 283 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
@@ -261,16 +287,27 @@ config X86_VOYAGER
261 If you do not specifically know you have a Voyager based machine, 287 If you do not specifically know you have a Voyager based machine,
262 say N here, otherwise the kernel you build will not be bootable. 288 say N here, otherwise the kernel you build will not be bootable.
263 289
290config X86_GENERICARCH
291 bool "Generic architecture"
292 depends on X86_32
293 help
294 This option compiles in the NUMAQ, Summit, bigsmp, ES7000 and default
295 subarchitectures. It is intended for a generic binary kernel.
296 If you select them all, the kernel will probe them one by one and will
297 fall back to the default.
298
299if X86_GENERICARCH
300
264config X86_NUMAQ 301config X86_NUMAQ
265 bool "NUMAQ (IBM/Sequent)" 302 bool "NUMAQ (IBM/Sequent)"
266 depends on SMP && X86_32 303 depends on SMP && X86_32 && PCI && X86_MPPARSE
267 select NUMA 304 select NUMA
268 help 305 help
269 This option is used for getting Linux to run on a (IBM/Sequent) NUMA 306 This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
270 multiquad box. This changes the way that processors are bootstrapped, 307 NUMA multiquad box. This changes the way that processors are
271 and uses Clustered Logical APIC addressing mode instead of Flat Logical. 308 bootstrapped, and uses Clustered Logical APIC addressing mode instead
272 You will need a new lynxer.elf file to flash your firmware with - send 309 of Flat Logical. You will need a new lynxer.elf file to flash your
273 email to <Martin.Bligh@us.ibm.com>. 310 firmware with - send email to <Martin.Bligh@us.ibm.com>.
274 311
275config X86_SUMMIT 312config X86_SUMMIT
276 bool "Summit/EXA (IBM x440)" 313 bool "Summit/EXA (IBM x440)"
@@ -279,72 +316,55 @@ config X86_SUMMIT
279 This option is needed for IBM systems that use the Summit/EXA chipset. 316 This option is needed for IBM systems that use the Summit/EXA chipset.
280 In particular, it is needed for the x440. 317 In particular, it is needed for the x440.
281 318
282 If you don't have one of these computers, you should say N here. 319config X86_ES7000
283 If you want to build a NUMA kernel, you must select ACPI. 320 bool "Support for Unisys ES7000 IA32 series"
321 depends on X86_32 && SMP
322 help
323 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
324 supposed to run on an IA32-based Unisys ES7000 system.
284 325
285config X86_BIGSMP 326config X86_BIGSMP
286 bool "Support for other sub-arch SMP systems with more than 8 CPUs" 327 bool "Support for big SMP systems with more than 8 CPUs"
287 depends on X86_32 && SMP 328 depends on X86_32 && SMP
288 help 329 help
289 This option is needed for the systems that have more than 8 CPUs 330 This option is needed for the systems that have more than 8 CPUs
290 and if the system is not of any sub-arch type above. 331 and if the system is not of any sub-arch type above.
291 332
292 If you don't have such a system, you should say N here. 333endif
334
335config X86_VSMP
336 bool "Support for ScaleMP vSMP"
337 select PARAVIRT
338 depends on X86_64 && PCI
339 help
340 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
341 supposed to run on these EM64T-based machines. Only choose this option
342 if you have one of these machines.
343
344endchoice
293 345
294config X86_VISWS 346config X86_VISWS
295 bool "SGI 320/540 (Visual Workstation)" 347 bool "SGI 320/540 (Visual Workstation)"
296 depends on X86_32 348 depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
297 help 349 help
298 The SGI Visual Workstation series is an IA32-based workstation 350 The SGI Visual Workstation series is an IA32-based workstation
299 based on SGI systems chips with some legacy PC hardware attached. 351 based on SGI systems chips with some legacy PC hardware attached.
300 352
301 Say Y here to create a kernel to run on the SGI 320 or 540. 353 Say Y here to create a kernel to run on the SGI 320 or 540.
302 354
303 A kernel compiled for the Visual Workstation will not run on PCs 355 A kernel compiled for the Visual Workstation will run on general
304 and vice versa. See <file:Documentation/sgi-visws.txt> for details. 356 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
305
306config X86_GENERICARCH
307 bool "Generic architecture (Summit, bigsmp, ES7000, default)"
308 depends on X86_32
309 help
310 This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
311 It is intended for a generic binary kernel.
312 If you want a NUMA kernel, select ACPI. We need SRAT for NUMA.
313
314config X86_ES7000
315 bool "Support for Unisys ES7000 IA32 series"
316 depends on X86_32 && SMP
317 help
318 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
319 supposed to run on an IA32-based Unisys ES7000 system.
320 Only choose this option if you have such a system, otherwise you
321 should say N here.
322 357
323config X86_RDC321X 358config X86_RDC321X
324 bool "RDC R-321x SoC" 359 bool "RDC R-321x SoC"
325 depends on X86_32 360 depends on X86_32
326 select M486 361 select M486
327 select X86_REBOOTFIXUPS 362 select X86_REBOOTFIXUPS
328 select GENERIC_GPIO
329 select LEDS_CLASS
330 select LEDS_GPIO
331 select NEW_LEDS
332 help 363 help
333 This option is needed for RDC R-321x system-on-chip, also known 364 This option is needed for RDC R-321x system-on-chip, also known
334 as R-8610-(G). 365 as R-8610-(G).
335 If you don't have one of these chips, you should say N here. 366 If you don't have one of these chips, you should say N here.
336 367
337config X86_VSMP
338 bool "Support for ScaleMP vSMP"
339 select PARAVIRT
340 depends on X86_64
341 help
342 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
343 supposed to run on these EM64T-based machines. Only choose this option
344 if you have one of these machines.
345
346endchoice
347
348config SCHED_NO_NO_OMIT_FRAME_POINTER 368config SCHED_NO_NO_OMIT_FRAME_POINTER
349 def_bool y 369 def_bool y
350 prompt "Single-depth WCHAN output" 370 prompt "Single-depth WCHAN output"
@@ -373,7 +393,7 @@ config VMI
373 bool "VMI Guest support" 393 bool "VMI Guest support"
374 select PARAVIRT 394 select PARAVIRT
375 depends on X86_32 395 depends on X86_32
376 depends on !(X86_VISWS || X86_VOYAGER) 396 depends on !X86_VOYAGER
377 help 397 help
378 VMI provides a paravirtualized interface to the VMware ESX server 398 VMI provides a paravirtualized interface to the VMware ESX server
379 (it could be used by other hypervisors in theory too, but is not 399 (it could be used by other hypervisors in theory too, but is not
@@ -384,7 +404,7 @@ config KVM_CLOCK
384 bool "KVM paravirtualized clock" 404 bool "KVM paravirtualized clock"
385 select PARAVIRT 405 select PARAVIRT
386 select PARAVIRT_CLOCK 406 select PARAVIRT_CLOCK
387 depends on !(X86_VISWS || X86_VOYAGER) 407 depends on !X86_VOYAGER
388 help 408 help
389 Turning on this option will allow you to run a paravirtualized clock 409 Turning on this option will allow you to run a paravirtualized clock
390 when running over the KVM hypervisor. Instead of relying on a PIT 410 when running over the KVM hypervisor. Instead of relying on a PIT
@@ -395,7 +415,7 @@ config KVM_CLOCK
395config KVM_GUEST 415config KVM_GUEST
396 bool "KVM Guest support" 416 bool "KVM Guest support"
397 select PARAVIRT 417 select PARAVIRT
398 depends on !(X86_VISWS || X86_VOYAGER) 418 depends on !X86_VOYAGER
399 help 419 help
400 This option enables various optimizations for running under the KVM 420 This option enables various optimizations for running under the KVM
401 hypervisor. 421 hypervisor.
@@ -404,7 +424,7 @@ source "arch/x86/lguest/Kconfig"
404 424
405config PARAVIRT 425config PARAVIRT
406 bool "Enable paravirtualization code" 426 bool "Enable paravirtualization code"
407 depends on !(X86_VISWS || X86_VOYAGER) 427 depends on !X86_VOYAGER
408 help 428 help
409 This changes the kernel so it can modify itself when it is run 429 This changes the kernel so it can modify itself when it is run
410 under a hypervisor, potentially improving performance significantly 430 under a hypervisor, potentially improving performance significantly
@@ -417,51 +437,31 @@ config PARAVIRT_CLOCK
417 437
418endif 438endif
419 439
420config MEMTEST_BOOTPARAM 440config PARAVIRT_DEBUG
421 bool "Memtest boot parameter" 441 bool "paravirt-ops debugging"
422 depends on X86_64 442 depends on PARAVIRT && DEBUG_KERNEL
423 default y 443 help
424 help 444 Enable to debug paravirt_ops internals. Specifically, BUG if
425 This option adds a kernel parameter 'memtest', which allows memtest 445 a paravirt_op is missing when it is called.
426 to be disabled at boot. If this option is selected, memtest
427 functionality can be disabled with memtest=0 on the kernel
428 command line. The purpose of this option is to allow a single
429 kernel image to be distributed with memtest built in, but not
430 necessarily enabled.
431
432 If you are unsure how to answer this question, answer Y.
433 446
434config MEMTEST_BOOTPARAM_VALUE 447config MEMTEST
435 int "Memtest boot parameter default value (0-4)" 448 bool "Memtest"
436 depends on MEMTEST_BOOTPARAM
437 range 0 4
438 default 0
439 help 449 help
440 This option sets the default value for the kernel parameter 450 This option adds a kernel parameter 'memtest', which allows memtest
441 'memtest', which allows memtest to be disabled at boot. If this 451 to be set.
442 option is set to 0 (zero), the memtest kernel parameter will 452 memtest=0 means disabled; -- default
443 default to 0, disabling memtest at bootup. If this option is 453 memtest=1 means do 1 test pattern;
444 set to 4, the memtest kernel parameter will default to 4, 454 ...
445 enabling memtest at bootup, and use that as pattern number. 455 memtest=4 means do 4 test patterns.
446 456 If you are unsure how to answer this question, answer N.
447 If you are unsure how to answer this question, answer 0.
448
449config ACPI_SRAT
450 def_bool y
451 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
452 select ACPI_NUMA
453
454config HAVE_ARCH_PARSE_SRAT
455 def_bool y
456 depends on ACPI_SRAT
457 457
458config X86_SUMMIT_NUMA 458config X86_SUMMIT_NUMA
459 def_bool y 459 def_bool y
460 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) 460 depends on X86_32 && NUMA && X86_GENERICARCH
461 461
462config X86_CYCLONE_TIMER 462config X86_CYCLONE_TIMER
463 def_bool y 463 def_bool y
464 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH 464 depends on X86_GENERICARCH
465 465
466config ES7000_CLUSTERED_APIC 466config ES7000_CLUSTERED_APIC
467 def_bool y 467 def_bool y
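The new MEMTEST option above collapses the old MEMTEST_BOOTPARAM/MEMTEST_BOOTPARAM_VALUE pair into a single boot parameter. As an illustration, assuming the semantics described in the new help text (the parameter values are quoted from it, nothing else is implied), a kernel booted with

    memtest=4

would run four test patterns over free memory early during boot, while memtest=0, the documented default, leaves the early memory scan disabled.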
@@ -549,6 +549,22 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
549 Calgary anyway, pass 'iommu=calgary' on the kernel command line. 549 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
550 If unsure, say Y. 550 If unsure, say Y.
551 551
552config AMD_IOMMU
553 bool "AMD IOMMU support"
554 select SWIOTLB
555 select PCI_MSI
556 depends on X86_64 && PCI && ACPI
557 help
558 With this option you can enable support for AMD IOMMU hardware in
559 your system. An IOMMU is a hardware component which provides
560 remapping of DMA memory accesses from devices. With an AMD IOMMU you
561 can isolate the DMA memory of different devices and protect the
562 system from misbehaving device drivers or hardware.
563
564 You can find out if your system has an AMD IOMMU if you look into
565 your BIOS for an option to enable it or if you have an IVRS ACPI
566 table.
567
552# need this always selected by IOMMU for the VIA workaround 568# need this always selected by IOMMU for the VIA workaround
553config SWIOTLB 569config SWIOTLB
554 bool 570 bool
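The AMD_IOMMU entry added above pulls in its own dependencies through select statements. A 64-bit configuration enabling it would therefore end up with at least the following lines (illustrative .config fragment; every symbol named here appears in the hunk above):

    CONFIG_AMD_IOMMU=y
    CONFIG_SWIOTLB=y
    CONFIG_PCI_MSI=y

since AMD_IOMMU selects both SWIOTLB and PCI_MSI and only builds on X86_64 with PCI and ACPI enabled.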
@@ -560,17 +576,26 @@ config SWIOTLB
560 3 GB of memory. If unsure, say Y. 576 3 GB of memory. If unsure, say Y.
561 577
562config IOMMU_HELPER 578config IOMMU_HELPER
563 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) 579 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
580
581config MAXSMP
582 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
583 depends on X86_64 && SMP && BROKEN
584 default n
585 help
586 Configure the maximum number of CPUs and NUMA Nodes for this architecture.
587 If unsure, say N.
564 588
565config NR_CPUS 589config NR_CPUS
566 int "Maximum number of CPUs (2-255)" 590 int "Maximum number of CPUs (2-512)" if !MAXSMP
567 range 2 255 591 range 2 512
568 depends on SMP 592 depends on SMP
593 default "4096" if MAXSMP
569 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 594 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
570 default "8" 595 default "8"
571 help 596 help
572 This allows you to specify the maximum number of CPUs which this 597 This allows you to specify the maximum number of CPUs which this
573 kernel will support. The maximum supported value is 255 and the 598 kernel will support. The maximum supported value is 512 and the
574 minimum value which makes sense is 2. 599 minimum value which makes sense is 2.
575 600
576 This is purely to save memory - each supported CPU adds 601 This is purely to save memory - each supported CPU adds
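To sketch how the new MAXSMP and NR_CPUS limits interact (option names come from the hunk above; the concrete value is only an example), a large x86-64 configuration that leaves MAXSMP off might contain:

    # CONFIG_MAXSMP is not set
    CONFIG_NR_CPUS=512

whereas selecting MAXSMP hides the NR_CPUS prompt and forces the value to 4096, as the new defaults show.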
@@ -598,7 +623,7 @@ source "kernel/Kconfig.preempt"
598 623
599config X86_UP_APIC 624config X86_UP_APIC
600 bool "Local APIC support on uniprocessors" 625 bool "Local APIC support on uniprocessors"
601 depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) 626 depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
602 help 627 help
603 A local APIC (Advanced Programmable Interrupt Controller) is an 628 A local APIC (Advanced Programmable Interrupt Controller) is an
604 integrated interrupt controller in the CPU. If you have a single-CPU 629 integrated interrupt controller in the CPU. If you have a single-CPU
@@ -623,11 +648,11 @@ config X86_UP_IOAPIC
623 648
624config X86_LOCAL_APIC 649config X86_LOCAL_APIC
625 def_bool y 650 def_bool y
626 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 651 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
627 652
628config X86_IO_APIC 653config X86_IO_APIC
629 def_bool y 654 def_bool y
630 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 655 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
631 656
632config X86_VISWS_APIC 657config X86_VISWS_APIC
633 def_bool y 658 def_bool y
@@ -681,7 +706,7 @@ config X86_MCE_NONFATAL
681 706
682config X86_MCE_P4THERMAL 707config X86_MCE_P4THERMAL
683 bool "check for P4 thermal throttling interrupt." 708 bool "check for P4 thermal throttling interrupt."
684 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS 709 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
685 help 710 help
686 Enabling this feature will cause a message to be printed when the P4 711 Enabling this feature will cause a message to be printed when the P4
687 enters thermal throttling. 712 enters thermal throttling.
@@ -751,23 +776,45 @@ config X86_REBOOTFIXUPS
751 Say N otherwise. 776 Say N otherwise.
752 777
753config MICROCODE 778config MICROCODE
754 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" 779 tristate "/dev/cpu/microcode - microcode support"
755 select FW_LOADER 780 select FW_LOADER
756 ---help--- 781 ---help---
757 If you say Y here, you will be able to update the microcode on 782 If you say Y here, you will be able to update the microcode on
758 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, 783 certain Intel and AMD processors. The Intel support is for the
759 Pentium III, Pentium 4, Xeon etc. You will obviously need the 784 IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
760 actual microcode binary data itself which is not shipped with the 785 Pentium 4, Xeon etc. The AMD support is for family 0x10 and
761 Linux kernel. 786 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
787 You will obviously need the actual microcode binary data itself
788 which is not shipped with the Linux kernel.
762 789
763 For latest news and information on obtaining all the required 790 This option selects the general module only, you need to select
764 ingredients for this driver, check: 791 at least one vendor specific module as well.
765 <http://www.urbanmyth.org/microcode/>.
766 792
767 To compile this driver as a module, choose M here: the 793 To compile this driver as a module, choose M here: the
768 module will be called microcode. 794 module will be called microcode.
769 795
770config MICROCODE_OLD_INTERFACE 796config MICROCODE_INTEL
797 bool "Intel microcode patch loading support"
798 depends on MICROCODE
799 default MICROCODE
800 select FW_LOADER
801 ---help---
802 This option enables microcode patch loading support for Intel
803 processors.
804
805 For latest news and information on obtaining all the required
806 Intel ingredients for this driver, check:
807 <http://www.urbanmyth.org/microcode/>.
808
809config MICROCODE_AMD
810 bool "AMD microcode patch loading support"
811 depends on MICROCODE
812 select FW_LOADER
813 ---help---
814 If you select this option, microcode patch loading support for AMD
815 processors will be enabled.
816
817config MICROCODE_OLD_INTERFACE
771 def_bool y 818 def_bool y
772 depends on MICROCODE 819 depends on MICROCODE
773 820
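With the driver split introduced above, a configuration that builds the microcode loader as a module and enables both vendor back ends would carry lines like the following (illustrative .config fragment; the option names are taken from this hunk):

    CONFIG_MICROCODE=m
    CONFIG_MICROCODE_INTEL=y
    CONFIG_MICROCODE_AMD=y
    CONFIG_MICROCODE_OLD_INTERFACE=y

The generic MICROCODE module is still required; the per-vendor options only add the corresponding patch-loading support, as the new help text states.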
@@ -911,18 +958,18 @@ config X86_PAE
911config NUMA 958config NUMA
912 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 959 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
913 depends on SMP 960 depends on SMP
914 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) 961 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
915 default n if X86_PC 962 default n if X86_PC
916 default y if (X86_NUMAQ || X86_SUMMIT) 963 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
917 help 964 help
918 Enable NUMA (Non Uniform Memory Access) support. 965 Enable NUMA (Non Uniform Memory Access) support.
919 The kernel will try to allocate memory used by a CPU on the 966 The kernel will try to allocate memory used by a CPU on the
920 local memory controller of the CPU and add some more 967 local memory controller of the CPU and add some more
921 NUMA awareness to the kernel. 968 NUMA awareness to the kernel.
922 969
923 For i386 this is currently highly experimental and should be only 970 For 32-bit this is currently highly experimental and should be only
924 used for kernel development. It might also cause boot failures. 971 used for kernel development. It might also cause boot failures.
925 For x86_64 this is recommended on all multiprocessor Opteron systems. 972 For 64-bit this is recommended on all multiprocessor Opteron systems.
926 If the system is EM64T, you should say N unless your system is 973 If the system is EM64T, you should say N unless your system is
927 EM64T NUMA. 974 EM64T NUMA.
928 975
@@ -966,12 +1013,16 @@ config NUMA_EMU
966 number of nodes. This is only useful for debugging. 1013 number of nodes. This is only useful for debugging.
967 1014
968config NODES_SHIFT 1015config NODES_SHIFT
969 int "Max num nodes shift(1-15)" 1016 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
970 range 1 15 if X86_64 1017 range 1 9 if X86_64
1018 default "9" if MAXSMP
971 default "6" if X86_64 1019 default "6" if X86_64
972 default "4" if X86_NUMAQ 1020 default "4" if X86_NUMAQ
973 default "3" 1021 default "3"
974 depends on NEED_MULTIPLE_NODES 1022 depends on NEED_MULTIPLE_NODES
1023 help
1024 Specify the maximum number of NUMA Nodes available on the target
1025 system. Increases memory reserved to accommodate various tables.
975 1026
976config HAVE_ARCH_BOOTMEM_NODE 1027config HAVE_ARCH_BOOTMEM_NODE
977 def_bool y 1028 def_bool y
@@ -991,7 +1042,7 @@ config HAVE_ARCH_ALLOC_REMAP
991 1042
992config ARCH_FLATMEM_ENABLE 1043config ARCH_FLATMEM_ENABLE
993 def_bool y 1044 def_bool y
994 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA 1045 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
995 1046
996config ARCH_DISCONTIGMEM_ENABLE 1047config ARCH_DISCONTIGMEM_ENABLE
997 def_bool y 1048 def_bool y
@@ -1007,7 +1058,7 @@ config ARCH_SPARSEMEM_DEFAULT
1007 1058
1008config ARCH_SPARSEMEM_ENABLE 1059config ARCH_SPARSEMEM_ENABLE
1009 def_bool y 1060 def_bool y
1010 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) 1061 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
1011 select SPARSEMEM_STATIC if X86_32 1062 select SPARSEMEM_STATIC if X86_32
1012 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1063 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1013 1064
@@ -1030,6 +1081,56 @@ config HIGHPTE
1030 low memory. Setting this option will put user-space page table 1081 low memory. Setting this option will put user-space page table
1031 entries in high memory. 1082 entries in high memory.
1032 1083
1084config X86_CHECK_BIOS_CORRUPTION
1085 bool "Check for low memory corruption"
1086 help
1087 Periodically check for memory corruption in low memory, which
1088 is suspected to be caused by the BIOS. Even when enabled in the
1089 configuration, it is disabled at runtime. Enable it by
1090 setting "memory_corruption_check=1" on the kernel command
1091 line. By default it scans the low 64k of memory every 60
1092 seconds; see the memory_corruption_check_size and
1093 memory_corruption_check_period parameters in
1094 Documentation/kernel-parameters.txt to adjust this.
1095
1096 When enabled with the default parameters, this option has
1097 almost no overhead, as it reserves a relatively small amount
1098 of memory and scans it infrequently. It both detects corruption
1099 and prevents it from affecting the running system.
1100
1101 It is, however, intended as a diagnostic tool; if repeatable
1102 BIOS-originated corruption always affects the same memory,
1103 you can use memmap= to prevent the kernel from using that
1104 memory.
1105
1106config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1107 bool "Set the default setting of memory_corruption_check"
1108 depends on X86_CHECK_BIOS_CORRUPTION
1109 default y
1110 help
1111 Set whether the default state of memory_corruption_check is
1112 on or off.
1113
1114config X86_RESERVE_LOW_64K
1115 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1116 default y
1117 help
1118 Reserve the first 64K of physical RAM on BIOSes that are known
1119 to potentially corrupt that memory range. A number of BIOSes are
1120 known to utilize this area during suspend/resume, so it must not
1121 be used by the kernel.
1122
1123 Set this to N if you are absolutely sure that you trust the BIOS
1124 to get all its memory reservations and usages right.
1125
1126 If you have doubts about the BIOS (e.g. suspend/resume does not
1127 work or there are kernel crashes after certain hardware hotplug
1128 events) and it's not AMI or Phoenix, then you might want to enable
1129 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1130 corruption patterns.
1131
1132 Say Y if unsure.
1133
1033config MATH_EMULATION 1134config MATH_EMULATION
1034 bool 1135 bool
1035 prompt "Math emulation" if X86_32 1136 prompt "Math emulation" if X86_32
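Tying together the three options added in this hunk: X86_CHECK_BIOS_CORRUPTION builds the low-memory scanner, X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK picks its default state, and X86_RESERVE_LOW_64K keeps the BIOS-prone range away from the kernel. As an illustrative boot line (the parameter names are quoted from the help text; the period value simply restates the documented 60-second default):

    memory_corruption_check=1 memory_corruption_check_period=60

turns the periodic scan on for one boot without rebuilding the kernel.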
@@ -1088,7 +1189,38 @@ config MTRR
1088 You can safely say Y even if your machine doesn't have MTRRs, you'll 1189 You can safely say Y even if your machine doesn't have MTRRs, you'll
1089 just add about 9 KB to your kernel. 1190 just add about 9 KB to your kernel.
1090 1191
1091 See <file:Documentation/mtrr.txt> for more information. 1192 See <file:Documentation/x86/mtrr.txt> for more information.
1193
1194config MTRR_SANITIZER
1195 def_bool y
1196 prompt "MTRR cleanup support"
1197 depends on MTRR
1198 help
1199 Convert MTRR layout from continuous to discrete, so X drivers can
1200 add writeback entries.
1201
1202 Can be disabled with disable_mtrr_cleanup on the kernel command line.
1203 The largest mtrr entry size for a continuous block can be set with
1204 mtrr_chunk_size.
1205
1206 If unsure, say Y.
1207
1208config MTRR_SANITIZER_ENABLE_DEFAULT
1209 int "MTRR cleanup enable value (0-1)"
1210 range 0 1
1211 default "0"
1212 depends on MTRR_SANITIZER
1213 help
1214 Default value for the mtrr cleanup enable flag (0 = disabled, 1 = enabled).
1215
1216config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1217 int "MTRR cleanup spare reg num (0-7)"
1218 range 0 7
1219 default "1"
1220 depends on MTRR_SANITIZER
1221 help
1222 Default number of spare MTRR entries kept by the cleanup; it can be
1223 changed via mtrr_spare_reg_nr=N on the kernel command line.
1092 1224
1093config X86_PAT 1225config X86_PAT
1094 bool 1226 bool
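The three sanitizer options above map onto a small set of build-time defaults plus runtime overrides. A configuration that keeps the cleanup compiled in but disabled by default, with one spare register reserved, would read as follows (values chosen to match the defaults shown in this hunk):

    CONFIG_MTRR_SANITIZER=y
    CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0
    CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1

Either default can still be overridden at boot with disable_mtrr_cleanup or mtrr_spare_reg_nr=N, as the help texts note.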
@@ -1131,7 +1263,6 @@ config IRQBALANCE
1131config SECCOMP 1263config SECCOMP
1132 def_bool y 1264 def_bool y
1133 prompt "Enable seccomp to safely compute untrusted bytecode" 1265 prompt "Enable seccomp to safely compute untrusted bytecode"
1134 depends on PROC_FS
1135 help 1266 help
1136 This kernel feature is useful for number crunching applications 1267 This kernel feature is useful for number crunching applications
1137 that may need to compute untrusted bytecode during their 1268 that may need to compute untrusted bytecode during their
@@ -1139,7 +1270,7 @@ config SECCOMP
1139 the process as file descriptors supporting the read/write 1270 the process as file descriptors supporting the read/write
1140 syscalls, it's possible to isolate those applications in 1271 syscalls, it's possible to isolate those applications in
1141 their own address space using seccomp. Once seccomp is 1272 their own address space using seccomp. Once seccomp is
1142 enabled via /proc/<pid>/seccomp, it cannot be disabled 1273 enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
1143 and the task is only allowed to execute a few safe syscalls 1274 and the task is only allowed to execute a few safe syscalls
1144 defined by each seccomp mode. 1275 defined by each seccomp mode.
1145 1276
@@ -1186,8 +1317,7 @@ config KEXEC
1186 strongly in flux, so no good recommendation can be made. 1317 strongly in flux, so no good recommendation can be made.
1187 1318
1188config CRASH_DUMP 1319config CRASH_DUMP
1189 bool "kernel crash dumps (EXPERIMENTAL)" 1320 bool "kernel crash dumps"
1190 depends on EXPERIMENTAL
1191 depends on X86_64 || (X86_32 && HIGHMEM) 1321 depends on X86_64 || (X86_32 && HIGHMEM)
1192 help 1322 help
1193 Generate crash dump after being started by kexec. 1323 Generate crash dump after being started by kexec.
@@ -1200,6 +1330,14 @@ config CRASH_DUMP
1200 (CONFIG_RELOCATABLE=y). 1330 (CONFIG_RELOCATABLE=y).
1201 For more details see Documentation/kdump/kdump.txt 1331 For more details see Documentation/kdump/kdump.txt
1202 1332
1333config KEXEC_JUMP
1334 bool "kexec jump (EXPERIMENTAL)"
1335 depends on EXPERIMENTAL
1336 depends on KEXEC && HIBERNATION && X86_32
1337 help
1338 Jump between original kernel and kexeced kernel and invoke
1339 code in physical address mode via KEXEC
1340
1203config PHYSICAL_START 1341config PHYSICAL_START
1204 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1342 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1205 default "0x1000000" if X86_NUMAQ 1343 default "0x1000000" if X86_NUMAQ
@@ -1286,14 +1424,14 @@ config PHYSICAL_ALIGN
1286 Don't change this unless you know what you are doing. 1424 Don't change this unless you know what you are doing.
1287 1425
1288config HOTPLUG_CPU 1426config HOTPLUG_CPU
1289 bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" 1427 bool "Support for hot-pluggable CPUs"
1290 depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER 1428 depends on SMP && HOTPLUG && !X86_VOYAGER
1291 ---help--- 1429 ---help---
1292 Say Y here to experiment with turning CPUs off and on, and to 1430 Say Y here to allow turning CPUs off and on. CPUs can be
1293 enable suspend on SMP systems. CPUs can be controlled through 1431 controlled through /sys/devices/system/cpu.
1294 /sys/devices/system/cpu. 1432 ( Note: power management support will enable this option
1295 Say N if you want to disable CPU hotplug and don't need to 1433 automatically on SMP systems. )
1296 suspend. 1434 Say N if you want to disable CPU hotplug.
1297 1435
1298config COMPAT_VDSO 1436config COMPAT_VDSO
1299 def_bool y 1437 def_bool y
@@ -1308,6 +1446,51 @@ config COMPAT_VDSO
1308 1446
1309 If unsure, say Y. 1447 If unsure, say Y.
1310 1448
1449config CMDLINE_BOOL
1450 bool "Built-in kernel command line"
1451 default n
1452 help
1453 Allow for specifying boot arguments to the kernel at
1454 build time. On some systems (e.g. embedded ones), it is
1455 necessary or convenient to provide some or all of the
1456 kernel boot arguments with the kernel itself (that is,
1457 to not rely on the boot loader to provide them.)
1458
1459 To compile command line arguments into the kernel,
1460 set this option to 'Y', then fill in the
1461 boot arguments in CONFIG_CMDLINE.
1462
1463 Systems with fully functional boot loaders (i.e. non-embedded)
1464 should leave this option set to 'N'.
1465
1466config CMDLINE
1467 string "Built-in kernel command string"
1468 depends on CMDLINE_BOOL
1469 default ""
1470 help
1471 Enter arguments here that should be compiled into the kernel
1472 image and used at boot time. If the boot loader provides a
1473 command line at boot time, it is appended to this string to
1474 form the full kernel command line, when the system boots.
1475
1476 However, you can use the CONFIG_CMDLINE_OVERRIDE option to
1477 change this behavior.
1478
1479 In most cases, the command line (whether built-in or provided
1480 by the boot loader) should specify the device for the root
1481 file system.
1482
1483config CMDLINE_OVERRIDE
1484 bool "Built-in command line overrides boot loader arguments"
1485 default n
1486 depends on CMDLINE_BOOL
1487 help
1488 Set this option to 'Y' to have the kernel ignore the boot loader
1489 command line, and use ONLY the built-in command line.
1490
1491 This is used to work around broken boot loaders. This should
1492 be set to 'N' under normal conditions.
1493
1311endmenu 1494endmenu
1312 1495
1313config ARCH_ENABLE_MEMORY_HOTPLUG 1496config ARCH_ENABLE_MEMORY_HOTPLUG
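To illustrate the built-in command line machinery added above (option names are from this hunk; the argument string itself is only an example), an embedded configuration might carry:

    CONFIG_CMDLINE_BOOL=y
    CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/mmcblk0p2 ro"
    # CONFIG_CMDLINE_OVERRIDE is not set

With CMDLINE_OVERRIDE left off, whatever the boot loader passes is appended to this built-in string; switching it on makes the kernel ignore the boot loader arguments entirely.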
@@ -1336,7 +1519,7 @@ config X86_APM_BOOT
1336 1519
1337menuconfig APM 1520menuconfig APM
1338 tristate "APM (Advanced Power Management) BIOS support" 1521 tristate "APM (Advanced Power Management) BIOS support"
1339 depends on X86_32 && PM_SLEEP && !X86_VISWS 1522 depends on X86_32 && PM_SLEEP
1340 ---help--- 1523 ---help---
1341 APM is a BIOS specification for saving power using several different 1524 APM is a BIOS specification for saving power using several different
1342 techniques. This is mostly useful for battery powered laptops with 1525 techniques. This is mostly useful for battery powered laptops with
@@ -1472,8 +1655,7 @@ endmenu
1472menu "Bus options (PCI etc.)" 1655menu "Bus options (PCI etc.)"
1473 1656
1474config PCI 1657config PCI
1475 bool "PCI support" if !X86_VISWS && !X86_VSMP 1658 bool "PCI support"
1476 depends on !X86_VOYAGER
1477 default y 1659 default y
1478 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1660 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1479 help 1661 help
@@ -1484,7 +1666,7 @@ config PCI
1484 1666
1485choice 1667choice
1486 prompt "PCI access mode" 1668 prompt "PCI access mode"
1487 depends on X86_32 && PCI && !X86_VISWS 1669 depends on X86_32 && PCI
1488 default PCI_GOANY 1670 default PCI_GOANY
1489 ---help--- 1671 ---help---
1490 On PCI systems, the BIOS can be used to detect the PCI devices and 1672 On PCI systems, the BIOS can be used to detect the PCI devices and
@@ -1521,12 +1703,12 @@ endchoice
1521 1703
1522config PCI_BIOS 1704config PCI_BIOS
1523 def_bool y 1705 def_bool y
1524 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1706 depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
1525 1707
1526# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1708# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1527config PCI_DIRECT 1709config PCI_DIRECT
1528 def_bool y 1710 def_bool y
1529 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC) || X86_VISWS) 1711 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
1530 1712
1531config PCI_MMCONFIG 1713config PCI_MMCONFIG
1532 def_bool y 1714 def_bool y
@@ -1574,6 +1756,14 @@ config DMAR_FLOPPY_WA
1574 workaround will setup a 1:1 mapping for the first 1756 workaround will setup a 1:1 mapping for the first
1575 16M to make floppy (an ISA device) work. 1757 16M to make floppy (an ISA device) work.
1576 1758
1759config INTR_REMAP
1760 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
1761 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
1762 help
1763 Supports interrupt remapping for IO-APIC and MSI devices.
1764 To use x2apic mode on CPUs which support x2APIC enhancements, or
1765 to support platforms with CPUs having > 8-bit APIC IDs, say Y.
1766
1577source "drivers/pci/pcie/Kconfig" 1767source "drivers/pci/pcie/Kconfig"
1578 1768
1579source "drivers/pci/Kconfig" 1769source "drivers/pci/Kconfig"
@@ -1586,7 +1776,7 @@ if X86_32
1586 1776
1587config ISA 1777config ISA
1588 bool "ISA support" 1778 bool "ISA support"
1589 depends on !(X86_VOYAGER || X86_VISWS) 1779 depends on !X86_VOYAGER
1590 help 1780 help
1591 Find out whether you have ISA slots on your motherboard. ISA is the 1781 Find out whether you have ISA slots on your motherboard. ISA is the
1592 name of a bus system, i.e. the way the CPU talks to the other stuff 1782 name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -1613,7 +1803,7 @@ config EISA
1613source "drivers/eisa/Kconfig" 1803source "drivers/eisa/Kconfig"
1614 1804
1615config MCA 1805config MCA
1616 bool "MCA support" if !(X86_VISWS || X86_VOYAGER) 1806 bool "MCA support" if !X86_VOYAGER
1617 default y if X86_VOYAGER 1807 default y if X86_VOYAGER
1618 help 1808 help
1619 MicroChannel Architecture is found in some IBM PS/2 machines and 1809 MicroChannel Architecture is found in some IBM PS/2 machines and
@@ -1690,7 +1880,7 @@ config IA32_EMULATION
1690 1880
1691config IA32_AOUT 1881config IA32_AOUT
1692 tristate "IA32 a.out support" 1882 tristate "IA32 a.out support"
1693 depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT 1883 depends on IA32_EMULATION
1694 help 1884 help
1695 Support old a.out binaries in the 32bit emulation. 1885 Support old a.out binaries in the 32bit emulation.
1696 1886
@@ -1704,7 +1894,7 @@ config COMPAT_FOR_U64_ALIGNMENT
1704 1894
1705config SYSVIPC_COMPAT 1895config SYSVIPC_COMPAT
1706 def_bool y 1896 def_bool y
1707 depends on X86_64 && COMPAT && SYSVIPC 1897 depends on COMPAT && SYSVIPC
1708 1898
1709endmenu 1899endmenu
1710 1900
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ad6301849a1..0b7c4a3f0651 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
38 - "Crusoe" for the Transmeta Crusoe series. 38 - "Crusoe" for the Transmeta Crusoe series.
39 - "Efficeon" for the Transmeta Efficeon series. 39 - "Efficeon" for the Transmeta Efficeon series.
40 - "Winchip-C6" for original IDT Winchip. 40 - "Winchip-C6" for original IDT Winchip.
41 - "Winchip-2" for IDT Winchip 2. 41 - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
42 - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
43 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). 42 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
44 - "Geode GX/LX" For AMD Geode GX and LX processors. 43 - "Geode GX/LX" For AMD Geode GX and LX processors.
45 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. 44 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
194 treat this chip as a 586TSC with some extended instructions 193 treat this chip as a 586TSC with some extended instructions
195 and alignment requirements. 194 and alignment requirements.
196 195
197config MWINCHIP2
198 bool "Winchip-2"
199 depends on X86_32
200 help
201 Select this for an IDT Winchip-2. Linux and GCC
202 treat this chip as a 586TSC with some extended instructions
203 and alignment requirements.
204
205config MWINCHIP3D 196config MWINCHIP3D
206 bool "Winchip-2A/Winchip-3" 197 bool "Winchip-2/Winchip-2A/Winchip-3"
207 depends on X86_32 198 depends on X86_32
208 help 199 help
209 Select this for an IDT Winchip-2A or 3. Linux and GCC 200 Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
210 treat this chip as a 586TSC with some extended instructions 201 treat this chip as a 586TSC with some extended instructions
211 and alignment requirements. Also enable out of order memory 202 and alignment requirements. Also enable out of order memory
212 stores for this CPU, which can increase performance of some 203 stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
318 int 309 int
319 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 310 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
323 314
324config X86_XADD 315config X86_XADD
@@ -344,7 +335,7 @@ config X86_F00F_BUG
344 335
345config X86_WP_WORKS_OK 336config X86_WP_WORKS_OK
346 def_bool y 337 def_bool y
347 depends on X86_32 && !M386 338 depends on !M386
348 339
349config X86_INVLPG 340config X86_INVLPG
350 def_bool y 341 def_bool y
@@ -360,11 +351,7 @@ config X86_POPAD_OK
360 351
361config X86_ALIGNMENT_16 352config X86_ALIGNMENT_16
362 def_bool y 353 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 354 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364
365config X86_GOOD_APIC
366 def_bool y
367 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
368 355
369config X86_INTEL_USERCOPY 356config X86_INTEL_USERCOPY
370 def_bool y 357 def_bool y
@@ -372,7 +359,7 @@ config X86_INTEL_USERCOPY
372 359
373config X86_USE_PPRO_CHECKSUM 360config X86_USE_PPRO_CHECKSUM
374 def_bool y 361 def_bool y
375 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 362 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
376 363
377config X86_USE_3DNOW 364config X86_USE_3DNOW
378 def_bool y 365 def_bool y
@@ -380,30 +367,37 @@ config X86_USE_3DNOW
380 367
381config X86_OOSTORE 368config X86_OOSTORE
382 def_bool y 369 def_bool y
383 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 370 depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
384 371
385# 372#
386# P6_NOPs are a relatively minor optimization that require a family >= 373# P6_NOPs are a relatively minor optimization that require a family >=
387# 6 processor, except that it is broken on certain VIA chips. 374# 6 processor, except that it is broken on certain VIA chips.
388# Furthermore, AMD chips prefer a totally different sequence of NOPs 375# Furthermore, AMD chips prefer a totally different sequence of NOPs
389# (which work on all CPUs). As a result, disallow these if we're 376# (which work on all CPUs). In addition, it looks like Virtual PC
390# compiling X86_GENERIC but not X86_64 (these NOPs do work on all 377# does not understand them.
391# x86-64 capable chips); the list of processors in the right-hand clause 378#
392# are the cores that benefit from this optimization. 379# As a result, disallow these if we're not compiling for X86_64 (these
380# NOPs do work on all x86-64 capable chips); the list of processors in
381# the right-hand clause are the cores that benefit from this optimization.
393# 382#
394config X86_P6_NOP 383config X86_P6_NOP
395 def_bool y 384 def_bool y
396 depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) 385 depends on X86_64
386 depends on (MCORE2 || MPENTIUM4 || MPSC)
397 387
398config X86_TSC 388config X86_TSC
399 def_bool y 389 def_bool y
400 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 390 depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
391
392config X86_CMPXCHG64
393 def_bool y
394 depends on X86_PAE || X86_64
401 395
402# this should be set for all -march=.. options where the compiler 396# this should be set for all -march=.. options where the compiler
403# generates cmov. 397# generates cmov.
404config X86_CMOV 398config X86_CMOV
405 def_bool y 399 def_bool y
406 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) 400 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
407 401
408config X86_MINIMUM_CPU_FAMILY 402config X86_MINIMUM_CPU_FAMILY
409 int 403 int
@@ -414,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY
414 408
415config X86_DEBUGCTLMSR 409config X86_DEBUGCTLMSR
416 def_bool y 410 def_bool y
417 depends on !(M586MMX || M586TSC || M586 || M486 || M386) 411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
412
413menuconfig PROCESSOR_SELECT
414 bool "Supported processor vendors" if EMBEDDED
415 help
416 This lets you choose what x86 vendor support code your kernel
417 will include.
418
419config CPU_SUP_INTEL
420 default y
421 bool "Support Intel processors" if PROCESSOR_SELECT
422 help
423 This enables detection, tunings and quirks for Intel processors
424
425 You need this enabled if you want your kernel to run on an
426 Intel CPU. Disabling this option on other types of CPUs
427 makes the kernel a tiny bit smaller. Disabling it on an Intel
428 CPU might render the kernel unbootable.
429
430 If unsure, say N.
431
432config CPU_SUP_CYRIX_32
433 default y
434 bool "Support Cyrix processors" if PROCESSOR_SELECT
435 depends on !64BIT
436 help
437 This enables detection, tunings and quirks for Cyrix processors
438
439 You need this enabled if you want your kernel to run on a
440 Cyrix CPU. Disabling this option on other types of CPUs
441 makes the kernel a tiny bit smaller. Disabling it on a Cyrix
442 CPU might render the kernel unbootable.
443
444 If unsure, say N.
445
446config CPU_SUP_AMD
447 default y
448 bool "Support AMD processors" if PROCESSOR_SELECT
449 help
450 This enables detection, tunings and quirks for AMD processors
451
452 You need this enabled if you want your kernel to run on an
453 AMD CPU. Disabling this option on other types of CPUs
454 makes the kernel a tiny bit smaller. Disabling it on an AMD
455 CPU might render the kernel unbootable.
456
457 If unsure, say N.
458
459config CPU_SUP_CENTAUR_32
460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 help
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 help
478 This enables detection, tunings and quirks for Centaur processors
479
480 You need this enabled if you want your kernel to run on a
481 Centaur CPU. Disabling this option on other types of CPUs
482 makes the kernel a tiny bit smaller. Disabling it on a Centaur
483 CPU might render the kernel unbootable.
484
485 If unsure, say N.
486
487config CPU_SUP_TRANSMETA_32
488 default y
489 bool "Support Transmeta processors" if PROCESSOR_SELECT
490 depends on !64BIT
491 help
492 This enables detection, tunings and quirks for Transmeta processors
493
494 You need this enabled if you want your kernel to run on a
495 Transmeta CPU. Disabling this option on other types of CPUs
496 makes the kernel a tiny bit smaller. Disabling it on a Transmeta
497 CPU might render the kernel unbootable.
498
499 If unsure, say N.
500
501config CPU_SUP_UMC_32
502 default y
503 bool "Support UMC processors" if PROCESSOR_SELECT
504 depends on !64BIT
505 help
506 This enables detection, tunings and quirks for UMC processors
507
508 You need this enabled if you want your kernel to run on a
509 UMC CPU. Disabling this option on other types of CPUs
510 makes the kernel a tiny bit smaller. Disabling it on a UMC
511 CPU might render the kernel unbootable.
512
513 If unsure, say N.
514
515config X86_DS
516 bool "Debug Store support"
517 default y
518 help
519 Add support for Debug Store.
520 This allows the kernel to provide a memory buffer to the hardware
521 to store various profiling and tracing events.
522
523config X86_PTRACE_BTS
524 bool "ptrace interface to Branch Trace Store"
525 default y
526 depends on (X86_DS && X86_DEBUGCTLMSR)
527 help
528 Add a ptrace interface to allow collecting an execution trace
529 of the traced task.
530 This collects control flow changes in a (cyclic) buffer and allows
531 debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a8d3c7e0414a..95fe606cb9a3 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is disabled, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -20,6 +22,14 @@ config NONPROMISC_DEVMEM
20 22
21 If in doubt, say Y. 23 If in doubt, say Y.
22 24
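For illustration only (this is not part of the patch): the kind of userspace access this option filters is a direct mapping of physical memory through /dev/mem. A minimal sketch, assuming root privileges and a page-aligned physical address; with the option enabled, mapping ordinary RAM is expected to fail while PCI space and the BIOS regions stay accessible.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	off_t phys = 0x100000;	/* example physical address: 1 MiB */
	int fd = open("/dev/mem", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/mem");
		return 1;
	}
	/* With the filter enabled, this map of ordinary RAM is rejected. */
	void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, phys);
	if (p == MAP_FAILED)
		perror("mmap");
	else
		printf("first byte: %02x\n", *(unsigned char *)p);
	close(fd);
	return 0;
}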
25config X86_VERBOSE_BOOTUP
26 bool "Enable verbose x86 bootup info messages"
27 default y
28 help
29 Enables the informational output from the decompression stage
30 (e.g. bzImage) of the boot. If you disable this you will still
31 see errors. Disable this if you want silent bootup.
32
23config EARLY_PRINTK 33config EARLY_PRINTK
24 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EMBEDDED
25 default y 35 default y
@@ -33,6 +43,19 @@ config EARLY_PRINTK
33 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
34 unless you want to debug such a crash. 44 unless you want to debug such a crash.
35 45
46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI
50 help
51 Write kernel log output directly into the EHCI debug port.
52
53 This is useful for kernel debugging when your machine crashes very
54 early before the console code is initialized. For normal operation
55 it is not recommended because it looks ugly and doesn't cooperate
 56 with klogd/syslogd or the X server. You should normally say N here,
 57 unless you want to debug such a crash. You need a USB debug device.
58
36config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
37 bool "Check for stack overflows" 60 bool "Check for stack overflows"
38 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
@@ -60,7 +83,7 @@ config DEBUG_PAGEALLOC
60config DEBUG_PER_CPU_MAPS 83config DEBUG_PER_CPU_MAPS
61 bool "Debug access to per_cpu maps" 84 bool "Debug access to per_cpu maps"
62 depends on DEBUG_KERNEL 85 depends on DEBUG_KERNEL
63 depends on X86_64_SMP 86 depends on X86_SMP
64 default n 87 default n
65 help 88 help
66 Say Y to verify that the per_cpu map being accessed has 89 Say Y to verify that the per_cpu map being accessed has
@@ -130,15 +153,6 @@ config 4KSTACKS
130 on the VM subsystem for higher order allocations. This option 153 on the VM subsystem for higher order allocations. This option
131 will also use IRQ stacks to compensate for the reduced stackspace. 154 will also use IRQ stacks to compensate for the reduced stackspace.
132 155
133config X86_FIND_SMP_CONFIG
134 def_bool y
135 depends on X86_LOCAL_APIC || X86_VOYAGER
136 depends on X86_32
137
138config X86_MPPARSE
139 def_bool y
140 depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
141
142config DOUBLEFAULT 156config DOUBLEFAULT
143 default y 157 default y
144 bool "Enable doublefault exception handler" if EMBEDDED 158 bool "Enable doublefault exception handler" if EMBEDDED
@@ -173,6 +187,33 @@ config IOMMU_LEAK
173 Add a simple leak tracer to the IOMMU code. This is useful when you 187 Add a simple leak tracer to the IOMMU code. This is useful when you
174 are debugging a buggy device driver that leaks IOMMU mappings. 188 are debugging a buggy device driver that leaks IOMMU mappings.
175 189
190config MMIOTRACE_HOOKS
191 bool
192
193config MMIOTRACE
194 bool "Memory mapped IO tracing"
195 depends on DEBUG_KERNEL && PCI
196 select TRACING
197 select MMIOTRACE_HOOKS
198 help
199 Mmiotrace traces Memory Mapped I/O access and is meant for
200 debugging and reverse engineering. It is called from the ioremap
201 implementation and works via page faults. Tracing is disabled by
202 default and can be enabled at run-time.
203
204 See Documentation/tracers/mmiotrace.txt.
205 If you are not helping to develop drivers, say N.
206
207config MMIOTRACE_TEST
208 tristate "Test module for mmiotrace"
209 depends on MMIOTRACE && m
210 help
211 This is a dumb module for testing mmiotrace. It is very dangerous
212 as it will write garbage to IO memory starting at a given address.
 213 However, it should be safe to use on e.g. an unused portion of VRAM.
214
215 Say N, unless you absolutely know what you are doing.
216
176# 217#
177# IO delay types: 218# IO delay types:
178# 219#
@@ -262,7 +303,6 @@ config CPA_DEBUG
262 303
263config OPTIMIZE_INLINING 304config OPTIMIZE_INLINING
264 bool "Allow gcc to uninline functions marked 'inline'" 305 bool "Allow gcc to uninline functions marked 'inline'"
265 depends on BROKEN
266 help 306 help
267 This option determines if the kernel forces gcc to inline the functions 307 This option determines if the kernel forces gcc to inline the functions
268 developers have marked 'inline'. Doing so takes away freedom from gcc to 308 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -273,5 +313,7 @@ config OPTIMIZE_INLINING
273 become the default in the future, until then this option is there to 313 become the default in the future, until then this option is there to
274 test gcc for this. 314 test gcc for this.
275 315
316 If unsure, say N.
317
276endmenu 318endmenu
277 319
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index c3e0eeeb1dd2..58ea55ce2423 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -113,38 +113,11 @@ mcore-y := arch/x86/mach-default/
113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager 113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ 114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115 115
116# VISWS subarch support
117mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
118mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/
119
120# NUMAQ subarch support
121mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
122mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/
123
124# BIGSMP subarch support
125mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
126mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/
127
128#Summit subarch support
129mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
130mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/
131
132# generic subarchitecture 116# generic subarchitecture
133mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic 117mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
134fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
135mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
136 120
137
138# ES7000 subarch support
139mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
140fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
141mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/
142
143# RDC R-321x subarch support
144mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
145mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
146core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
147
148# default subarch .h files 121# default subarch .h files
149mflags-y += -Iinclude/asm-x86/mach-default 122mflags-y += -Iinclude/asm-x86/mach-default
150 123
@@ -160,6 +133,7 @@ KBUILD_AFLAGS += $(mflags-y)
160 133
161head-y := arch/x86/kernel/head_$(BITS).o 134head-y := arch/x86/kernel/head_$(BITS).o
162head-y += arch/x86/kernel/head$(BITS).o 135head-y += arch/x86/kernel/head$(BITS).o
136head-y += arch/x86/kernel/head.o
163head-y += arch/x86/kernel/init_task.o 137head-y += arch/x86/kernel/init_task.o
164 138
165libs-y += arch/x86/lib/ 139libs-y += arch/x86/lib/
@@ -210,12 +184,12 @@ all: bzImage
210 184
211# KBUILD_IMAGE specify target image being built 185# KBUILD_IMAGE specify target image being built
212 KBUILD_IMAGE := $(boot)/bzImage 186 KBUILD_IMAGE := $(boot)/bzImage
213zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage 187zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
214 188
215zImage bzImage: vmlinux 189zImage bzImage: vmlinux
216 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 190 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
217 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 191 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
218 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage 192 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
219 193
220compressed: zImage 194compressed: zImage
221 195
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e919..80177ec052f0 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) 30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
31cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 31cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 32cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 44# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 45cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 46
47# Bug fix for binutils: this option is required in order to keep
48# binutils from generating NOPL instructions against our will.
49ifneq ($(CONFIG_X86_P6_NOP),y)
50cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
51endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f8..cd48c7210016 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index e01aafd03bde..4063d630deff 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -1,7 +1,7 @@
1/* -*- linux-c -*- ------------------------------------------------------- * 1/* -*- linux-c -*- ------------------------------------------------------- *
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * 5 *
6 * This file is part of the Linux kernel, and is made available under 6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 7 * the terms of the GNU General Public License version 2.
@@ -95,6 +95,9 @@ static void enable_a20_kbc(void)
95 95
96 outb(0xdf, 0x60); /* A20 on */ 96 outb(0xdf, 0x60); /* A20 on */
97 empty_8042(); 97 empty_8042();
98
99 outb(0xff, 0x64); /* Null command, but UHCI wants it */
100 empty_8042();
98} 101}
99 102
100static void enable_a20_fast(void) 103static void enable_a20_fast(void)
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7cb..cc0ef13fba7a 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,10 +24,14 @@
24#include <linux/edd.h> 24#include <linux/edd.h>
25#include <asm/boot.h> 25#include <asm/boot.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include "bitops.h"
28#include <asm/cpufeature.h>
27 29
28/* Useful macros */ 30/* Useful macros */
29#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
30 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
34
31extern struct setup_header hdr; 35extern struct setup_header hdr;
32extern struct boot_params boot_params; 36extern struct boot_params boot_params;
33 37
@@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
242int cmdline_find_option_bool(const char *option); 246int cmdline_find_option_bool(const char *option);
243 247
244/* cpu.c, cpucheck.c */ 248/* cpu.c, cpucheck.c */
249struct cpu_features {
250 int level; /* Family, or 64 for x86-64 */
251 int model;
252 u32 flags[NCAPINTS];
253};
254extern struct cpu_features cpu;
245int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 255int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
246int validate_cpu(void); 256int validate_cpu(void);
247 257
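A hypothetical helper (not in the patch) only to show how the now-shared cpu descriptor and the new ARRAY_SIZE() macro are meant to be used together; has_feature() and its word/bit arguments are invented for this sketch:

/* Test one CPUID feature bit in the boot-time cpu descriptor above. */
static int has_feature(int word, int bit)
{
	if (word < 0 || word >= (int)ARRAY_SIZE(cpu.flags))
		return 0;
	return (cpu.flags[word] >> bit) & 1;
}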
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93e..1771c804e02f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec7..29c5fbf08392 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
137 */ 137 */
138 movl output_len(%ebx), %eax 138 movl output_len(%ebx), %eax
139 pushl %eax 139 pushl %eax
140 # push arguments for decompress_kernel:
140 pushl %ebp # output address 141 pushl %ebp # output address
141 movl input_len(%ebx), %eax 142 movl input_len(%ebx), %eax
142 pushl %eax # input_len 143 pushl %eax # input_len
143 leal input_data(%ebx), %eax 144 leal input_data(%ebx), %eax
144 pushl %eax # input_data 145 pushl %eax # input_data
145 leal boot_heap(%ebx), %eax 146 leal boot_heap(%ebx), %eax
146 pushl %eax # heap area as third argument 147 pushl %eax # heap area
147 pushl %esi # real mode pointer as second arg 148 pushl %esi # real mode pointer
148 call decompress_kernel 149 call decompress_kernel
149 addl $20, %esp 150 addl $20, %esp
150 popl %ecx 151 popl %ecx
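For orientation (the first two parameters match the misc.c prototype visible later in this patch; the remaining ones are inferred from the pushes above, so treat this as a sketch): the five 32-bit arguments account for the addl $20, %esp after the call, and output_len is pushed beforehand so it can be popped afterwards.

/* 32-bit cdecl view of the call set up above; asmlinkage keeps all
 * arguments on the stack, so the last push is the first argument. */
asmlinkage void decompress_kernel(void *rmode,			/* pushl %esi */
				  memptr heap,			/* boot_heap  */
				  unsigned char *input_data,
				  unsigned long input_len,
				  unsigned char *output);	/* pushl %ebp */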
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d8819efac81d..1d5dff4123e1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -30,6 +30,7 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/boot.h> 31#include <asm/boot.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33#include <asm/processor-flags.h>
33#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
34 35
35.section ".text.head" 36.section ".text.head"
@@ -109,7 +110,7 @@ startup_32:
109 110
110 /* Enable PAE mode */ 111 /* Enable PAE mode */
111 xorl %eax, %eax 112 xorl %eax, %eax
112 orl $(1 << 5), %eax 113 orl $(X86_CR4_PAE), %eax
113 movl %eax, %cr4 114 movl %eax, %cr4
114 115
115 /* 116 /*
@@ -170,7 +171,7 @@ startup_32:
170 pushl %eax 171 pushl %eax
171 172
172 /* Enter paged protected Mode, activating Long Mode */ 173 /* Enter paged protected Mode, activating Long Mode */
173 movl $0x80000001, %eax /* Enable Paging and Protected mode */ 174 movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
174 movl %eax, %cr0 175 movl %eax, %cr0
175 176
176 /* Jump from 32bit compatibility mode into 64bit mode. */ 177 /* Jump from 32bit compatibility mode into 64bit mode. */
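The two constant substitutions above are pure readability cleanups. For reference, the values behind the new symbols (as defined in <asm/processor-flags.h>; reproduced here only for illustration) encode exactly the old literals:

#define X86_CR4_PAE	0x00000020	/* bit 5  == the old (1 << 5)      */
#define X86_CR0_PE	0x00000001	/* bit 0                           */
#define X86_CR0_PG	0x80000000	/* bit 31; PG | PE == 0x80000001   */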
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 90456cee47c3..5780d361105b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
16 */ 16 */
17#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
19#define _ASM_DESC_H_ 1 19#define ASM_X86__DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -27,9 +27,10 @@
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/screen_info.h> 28#include <linux/screen_info.h>
29#include <linux/elf.h> 29#include <linux/elf.h>
30#include <asm/io.h> 30#include <linux/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h>
33 34
34/* WARNING!! 35/* WARNING!!
35 * This code is compiled with -fPIC and it is relocated dynamically 36 * This code is compiled with -fPIC and it is relocated dynamically
@@ -181,32 +182,23 @@ static unsigned outcnt;
181static int fill_inbuf(void); 182static int fill_inbuf(void);
182static void flush_window(void); 183static void flush_window(void);
183static void error(char *m); 184static void error(char *m);
184static void gzip_mark(void **);
185static void gzip_release(void **);
186 185
187/* 186/*
188 * This is set up by the setup-routine at boot-time 187 * This is set up by the setup-routine at boot-time
189 */ 188 */
190static unsigned char *real_mode; /* Pointer to real-mode data */ 189static struct boot_params *real_mode; /* Pointer to real-mode data */
191 190static int quiet;
192#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
193#ifndef STANDARD_MEMORY_BIOS_CALL
194#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
195#endif
196#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
197 191
198extern unsigned char input_data[]; 192extern unsigned char input_data[];
199extern int input_len; 193extern int input_len;
200 194
201static long bytes_out; 195static long bytes_out;
202 196
203static void *malloc(int size);
204static void free(void *where);
205
206static void *memset(void *s, int c, unsigned n); 197static void *memset(void *s, int c, unsigned n);
207static void *memcpy(void *dest, const void *src, unsigned n); 198static void *memcpy(void *dest, const void *src, unsigned n);
208 199
209static void putstr(const char *); 200static void __putstr(int, const char *);
201#define putstr(__x) __putstr(0, __x)
210 202
211#ifdef CONFIG_X86_64 203#ifdef CONFIG_X86_64
212#define memptr long 204#define memptr long
@@ -221,46 +213,8 @@ static char *vidmem;
221static int vidport; 213static int vidport;
222static int lines, cols; 214static int lines, cols;
223 215
224#ifdef CONFIG_X86_NUMAQ
225void *xquad_portio;
226#endif
227
228#include "../../../../lib/inflate.c" 216#include "../../../../lib/inflate.c"
229 217
230static void *malloc(int size)
231{
232 void *p;
233
234 if (size < 0)
235 error("Malloc error");
236 if (free_mem_ptr <= 0)
237 error("Memory error");
238
239 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
240
241 p = (void *)free_mem_ptr;
242 free_mem_ptr += size;
243
244 if (free_mem_ptr >= free_mem_end_ptr)
245 error("Out of memory");
246
247 return p;
248}
249
250static void free(void *where)
251{ /* Don't care */
252}
253
254static void gzip_mark(void **ptr)
255{
256 *ptr = (void *) free_mem_ptr;
257}
258
259static void gzip_release(void **ptr)
260{
261 free_mem_ptr = (memptr) *ptr;
262}
263
264static void scroll(void) 218static void scroll(void)
265{ 219{
266 int i; 220 int i;
@@ -270,18 +224,24 @@ static void scroll(void)
270 vidmem[i] = ' '; 224 vidmem[i] = ' ';
271} 225}
272 226
273static void putstr(const char *s) 227static void __putstr(int error, const char *s)
274{ 228{
275 int x, y, pos; 229 int x, y, pos;
276 char c; 230 char c;
277 231
232#ifndef CONFIG_X86_VERBOSE_BOOTUP
233 if (!error)
234 return;
235#endif
236
278#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
279 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) 238 if (real_mode->screen_info.orig_video_mode == 0 &&
239 lines == 0 && cols == 0)
280 return; 240 return;
281#endif 241#endif
282 242
283 x = RM_SCREEN_INFO.orig_x; 243 x = real_mode->screen_info.orig_x;
284 y = RM_SCREEN_INFO.orig_y; 244 y = real_mode->screen_info.orig_y;
285 245
286 while ((c = *s++) != '\0') { 246 while ((c = *s++) != '\0') {
287 if (c == '\n') { 247 if (c == '\n') {
@@ -291,7 +251,7 @@ static void putstr(const char *s)
291 y--; 251 y--;
292 } 252 }
293 } else { 253 } else {
294 vidmem [(x + cols * y) * 2] = c; 254 vidmem[(x + cols * y) * 2] = c;
295 if (++x >= cols) { 255 if (++x >= cols) {
296 x = 0; 256 x = 0;
297 if (++y >= lines) { 257 if (++y >= lines) {
@@ -302,8 +262,8 @@ static void putstr(const char *s)
302 } 262 }
303 } 263 }
304 264
305 RM_SCREEN_INFO.orig_x = x; 265 real_mode->screen_info.orig_x = x;
306 RM_SCREEN_INFO.orig_y = y; 266 real_mode->screen_info.orig_y = y;
307 267
308 pos = (x + cols * y) * 2; /* Update cursor position */ 268 pos = (x + cols * y) * 2; /* Update cursor position */
309 outb(14, vidport); 269 outb(14, vidport);
@@ -317,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
317 int i; 277 int i;
318 char *ss = s; 278 char *ss = s;
319 279
320 for (i = 0; i < n; i++) ss[i] = c; 280 for (i = 0; i < n; i++)
281 ss[i] = c;
321 return s; 282 return s;
322} 283}
323 284
@@ -327,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
327 const char *s = src; 288 const char *s = src;
328 char *d = dest; 289 char *d = dest;
329 290
330 for (i = 0; i < n; i++) d[i] = s[i]; 291 for (i = 0; i < n; i++)
292 d[i] = s[i];
331 return dest; 293 return dest;
332} 294}
333 295
@@ -366,9 +328,9 @@ static void flush_window(void)
366 328
367static void error(char *x) 329static void error(char *x)
368{ 330{
369 putstr("\n\n"); 331 __putstr(1, "\n\n");
370 putstr(x); 332 __putstr(1, x);
371 putstr("\n\n -- System halted"); 333 __putstr(1, "\n\n -- System halted");
372 334
373 while (1) 335 while (1)
374 asm("hlt"); 336 asm("hlt");
@@ -395,7 +357,8 @@ static void parse_elf(void *output)
395 return; 357 return;
396 } 358 }
397 359
398 putstr("Parsing ELF... "); 360 if (!quiet)
361 putstr("Parsing ELF... ");
399 362
400 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); 363 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
401 if (!phdrs) 364 if (!phdrs)
@@ -430,7 +393,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
430{ 393{
431 real_mode = rmode; 394 real_mode = rmode;
432 395
433 if (RM_SCREEN_INFO.orig_video_mode == 7) { 396 if (real_mode->hdr.loadflags & QUIET_FLAG)
397 quiet = 1;
398
399 if (real_mode->screen_info.orig_video_mode == 7) {
434 vidmem = (char *) 0xb0000; 400 vidmem = (char *) 0xb0000;
435 vidport = 0x3b4; 401 vidport = 0x3b4;
436 } else { 402 } else {
@@ -438,8 +404,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
438 vidport = 0x3d4; 404 vidport = 0x3d4;
439 } 405 }
440 406
441 lines = RM_SCREEN_INFO.orig_video_lines; 407 lines = real_mode->screen_info.orig_video_lines;
442 cols = RM_SCREEN_INFO.orig_video_cols; 408 cols = real_mode->screen_info.orig_video_cols;
443 409
444 window = output; /* Output buffer (Normally at 1M) */ 410 window = output; /* Output buffer (Normally at 1M) */
445 free_mem_ptr = heap; /* Heap */ 411 free_mem_ptr = heap; /* Heap */
@@ -465,9 +431,11 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
465#endif 431#endif
466 432
467 makecrc(); 433 makecrc();
468 putstr("\nDecompressing Linux... "); 434 if (!quiet)
435 putstr("\nDecompressing Linux... ");
469 gunzip(); 436 gunzip();
470 parse_elf(output); 437 parse_elf(output);
471 putstr("done.\nBooting the kernel.\n"); 438 if (!quiet)
439 putstr("done.\nBooting the kernel.\n");
472 return; 440 return;
473} 441}
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index edaadea90aaf..857e492c571e 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -10,16 +10,20 @@
10#define USE_BSD 10#define USE_BSD
11#include <endian.h> 11#include <endian.h>
12 12
13#define MAX_SHDRS 100
14#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 13#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
15static Elf32_Ehdr ehdr; 14static Elf32_Ehdr ehdr;
16static Elf32_Shdr shdr[MAX_SHDRS];
17static Elf32_Sym *symtab[MAX_SHDRS];
18static Elf32_Rel *reltab[MAX_SHDRS];
19static char *strtab[MAX_SHDRS];
20static unsigned long reloc_count, reloc_idx; 15static unsigned long reloc_count, reloc_idx;
21static unsigned long *relocs; 16static unsigned long *relocs;
22 17
18struct section {
19 Elf32_Shdr shdr;
20 struct section *link;
21 Elf32_Sym *symtab;
22 Elf32_Rel *reltab;
23 char *strtab;
24};
25static struct section *secs;
26
23/* 27/*
24 * Following symbols have been audited. There values are constant and do 28 * Following symbols have been audited. There values are constant and do
25 * not change if bzImage is loaded at a different physical address than 29 * not change if bzImage is loaded at a different physical address than
@@ -35,7 +39,7 @@ static int is_safe_abs_reloc(const char* sym_name)
35{ 39{
36 int i; 40 int i;
37 41
38 for(i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { 42 for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) {
39 if (!strcmp(sym_name, safe_abs_relocs[i])) 43 if (!strcmp(sym_name, safe_abs_relocs[i]))
40 /* Match found */ 44 /* Match found */
41 return 1; 45 return 1;
@@ -137,10 +141,10 @@ static const char *sec_name(unsigned shndx)
137{ 141{
138 const char *sec_strtab; 142 const char *sec_strtab;
139 const char *name; 143 const char *name;
140 sec_strtab = strtab[ehdr.e_shstrndx]; 144 sec_strtab = secs[ehdr.e_shstrndx].strtab;
141 name = "<noname>"; 145 name = "<noname>";
142 if (shndx < ehdr.e_shnum) { 146 if (shndx < ehdr.e_shnum) {
143 name = sec_strtab + shdr[shndx].sh_name; 147 name = sec_strtab + secs[shndx].shdr.sh_name;
144 } 148 }
145 else if (shndx == SHN_ABS) { 149 else if (shndx == SHN_ABS) {
146 name = "ABSOLUTE"; 150 name = "ABSOLUTE";
@@ -159,7 +163,7 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
159 name = sym_strtab + sym->st_name; 163 name = sym_strtab + sym->st_name;
160 } 164 }
161 else { 165 else {
162 name = sec_name(shdr[sym->st_shndx].sh_name); 166 name = sec_name(secs[sym->st_shndx].shdr.sh_name);
163 } 167 }
164 return name; 168 return name;
165} 169}
@@ -244,29 +248,34 @@ static void read_ehdr(FILE *fp)
244static void read_shdrs(FILE *fp) 248static void read_shdrs(FILE *fp)
245{ 249{
246 int i; 250 int i;
247 if (ehdr.e_shnum > MAX_SHDRS) { 251 Elf32_Shdr shdr;
248 die("%d section headers supported: %d\n", 252
249 ehdr.e_shnum, MAX_SHDRS); 253 secs = calloc(ehdr.e_shnum, sizeof(struct section));
254 if (!secs) {
255 die("Unable to allocate %d section headers\n",
256 ehdr.e_shnum);
250 } 257 }
251 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { 258 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
252 die("Seek to %d failed: %s\n", 259 die("Seek to %d failed: %s\n",
253 ehdr.e_shoff, strerror(errno)); 260 ehdr.e_shoff, strerror(errno));
254 } 261 }
255 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) { 262 for (i = 0; i < ehdr.e_shnum; i++) {
256 die("Cannot read ELF section headers: %s\n", 263 struct section *sec = &secs[i];
257 strerror(errno)); 264 if (fread(&shdr, sizeof shdr, 1, fp) != 1)
258 } 265 die("Cannot read ELF section headers %d/%d: %s\n",
259 for(i = 0; i < ehdr.e_shnum; i++) { 266 i, ehdr.e_shnum, strerror(errno));
260 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name); 267 sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name);
261 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type); 268 sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type);
262 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags); 269 sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags);
263 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr); 270 sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr);
264 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset); 271 sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset);
265 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size); 272 sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size);
266 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link); 273 sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link);
267 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info); 274 sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info);
268 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign); 275 sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
269 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize); 276 sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize);
277 if (sec->shdr.sh_link < ehdr.e_shnum)
278 sec->link = &secs[sec->shdr.sh_link];
270 } 279 }
271 280
272} 281}
@@ -274,20 +283,22 @@ static void read_shdrs(FILE *fp)
274static void read_strtabs(FILE *fp) 283static void read_strtabs(FILE *fp)
275{ 284{
276 int i; 285 int i;
277 for(i = 0; i < ehdr.e_shnum; i++) { 286 for (i = 0; i < ehdr.e_shnum; i++) {
278 if (shdr[i].sh_type != SHT_STRTAB) { 287 struct section *sec = &secs[i];
288 if (sec->shdr.sh_type != SHT_STRTAB) {
279 continue; 289 continue;
280 } 290 }
281 strtab[i] = malloc(shdr[i].sh_size); 291 sec->strtab = malloc(sec->shdr.sh_size);
282 if (!strtab[i]) { 292 if (!sec->strtab) {
283 die("malloc of %d bytes for strtab failed\n", 293 die("malloc of %d bytes for strtab failed\n",
284 shdr[i].sh_size); 294 sec->shdr.sh_size);
285 } 295 }
286 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 296 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
287 die("Seek to %d failed: %s\n", 297 die("Seek to %d failed: %s\n",
288 shdr[i].sh_offset, strerror(errno)); 298 sec->shdr.sh_offset, strerror(errno));
289 } 299 }
290 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 300 if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
301 != sec->shdr.sh_size) {
291 die("Cannot read symbol table: %s\n", 302 die("Cannot read symbol table: %s\n",
292 strerror(errno)); 303 strerror(errno));
293 } 304 }
@@ -297,28 +308,31 @@ static void read_strtabs(FILE *fp)
297static void read_symtabs(FILE *fp) 308static void read_symtabs(FILE *fp)
298{ 309{
299 int i,j; 310 int i,j;
300 for(i = 0; i < ehdr.e_shnum; i++) { 311 for (i = 0; i < ehdr.e_shnum; i++) {
301 if (shdr[i].sh_type != SHT_SYMTAB) { 312 struct section *sec = &secs[i];
313 if (sec->shdr.sh_type != SHT_SYMTAB) {
302 continue; 314 continue;
303 } 315 }
304 symtab[i] = malloc(shdr[i].sh_size); 316 sec->symtab = malloc(sec->shdr.sh_size);
305 if (!symtab[i]) { 317 if (!sec->symtab) {
306 die("malloc of %d bytes for symtab failed\n", 318 die("malloc of %d bytes for symtab failed\n",
307 shdr[i].sh_size); 319 sec->shdr.sh_size);
308 } 320 }
309 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 321 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
310 die("Seek to %d failed: %s\n", 322 die("Seek to %d failed: %s\n",
311 shdr[i].sh_offset, strerror(errno)); 323 sec->shdr.sh_offset, strerror(errno));
312 } 324 }
313 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 325 if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
326 != sec->shdr.sh_size) {
314 die("Cannot read symbol table: %s\n", 327 die("Cannot read symbol table: %s\n",
315 strerror(errno)); 328 strerror(errno));
316 } 329 }
317 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) { 330 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
318 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name); 331 Elf32_Sym *sym = &sec->symtab[j];
319 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value); 332 sym->st_name = elf32_to_cpu(sym->st_name);
320 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size); 333 sym->st_value = elf32_to_cpu(sym->st_value);
321 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx); 334 sym->st_size = elf32_to_cpu(sym->st_size);
335 sym->st_shndx = elf16_to_cpu(sym->st_shndx);
322 } 336 }
323 } 337 }
324} 338}
@@ -327,26 +341,29 @@ static void read_symtabs(FILE *fp)
327static void read_relocs(FILE *fp) 341static void read_relocs(FILE *fp)
328{ 342{
329 int i,j; 343 int i,j;
330 for(i = 0; i < ehdr.e_shnum; i++) { 344 for (i = 0; i < ehdr.e_shnum; i++) {
331 if (shdr[i].sh_type != SHT_REL) { 345 struct section *sec = &secs[i];
346 if (sec->shdr.sh_type != SHT_REL) {
332 continue; 347 continue;
333 } 348 }
334 reltab[i] = malloc(shdr[i].sh_size); 349 sec->reltab = malloc(sec->shdr.sh_size);
335 if (!reltab[i]) { 350 if (!sec->reltab) {
336 die("malloc of %d bytes for relocs failed\n", 351 die("malloc of %d bytes for relocs failed\n",
337 shdr[i].sh_size); 352 sec->shdr.sh_size);
338 } 353 }
339 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 354 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
340 die("Seek to %d failed: %s\n", 355 die("Seek to %d failed: %s\n",
341 shdr[i].sh_offset, strerror(errno)); 356 sec->shdr.sh_offset, strerror(errno));
342 } 357 }
343 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 358 if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
359 != sec->shdr.sh_size) {
344 die("Cannot read symbol table: %s\n", 360 die("Cannot read symbol table: %s\n",
345 strerror(errno)); 361 strerror(errno));
346 } 362 }
347 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 363 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
348 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset); 364 Elf32_Rel *rel = &sec->reltab[j];
349 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info); 365 rel->r_offset = elf32_to_cpu(rel->r_offset);
366 rel->r_info = elf32_to_cpu(rel->r_info);
350 } 367 }
351 } 368 }
352} 369}
@@ -357,19 +374,21 @@ static void print_absolute_symbols(void)
357 int i; 374 int i;
358 printf("Absolute symbols\n"); 375 printf("Absolute symbols\n");
359 printf(" Num: Value Size Type Bind Visibility Name\n"); 376 printf(" Num: Value Size Type Bind Visibility Name\n");
360 for(i = 0; i < ehdr.e_shnum; i++) { 377 for (i = 0; i < ehdr.e_shnum; i++) {
378 struct section *sec = &secs[i];
361 char *sym_strtab; 379 char *sym_strtab;
362 Elf32_Sym *sh_symtab; 380 Elf32_Sym *sh_symtab;
363 int j; 381 int j;
364 if (shdr[i].sh_type != SHT_SYMTAB) { 382
383 if (sec->shdr.sh_type != SHT_SYMTAB) {
365 continue; 384 continue;
366 } 385 }
367 sh_symtab = symtab[i]; 386 sh_symtab = sec->symtab;
368 sym_strtab = strtab[shdr[i].sh_link]; 387 sym_strtab = sec->link->strtab;
369 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) { 388 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
370 Elf32_Sym *sym; 389 Elf32_Sym *sym;
371 const char *name; 390 const char *name;
372 sym = &symtab[i][j]; 391 sym = &sec->symtab[j];
373 name = sym_name(sym_strtab, sym); 392 name = sym_name(sym_strtab, sym);
374 if (sym->st_shndx != SHN_ABS) { 393 if (sym->st_shndx != SHN_ABS) {
375 continue; 394 continue;
@@ -389,26 +408,27 @@ static void print_absolute_relocs(void)
389{ 408{
390 int i, printed = 0; 409 int i, printed = 0;
391 410
392 for(i = 0; i < ehdr.e_shnum; i++) { 411 for (i = 0; i < ehdr.e_shnum; i++) {
412 struct section *sec = &secs[i];
413 struct section *sec_applies, *sec_symtab;
393 char *sym_strtab; 414 char *sym_strtab;
394 Elf32_Sym *sh_symtab; 415 Elf32_Sym *sh_symtab;
395 unsigned sec_applies, sec_symtab;
396 int j; 416 int j;
397 if (shdr[i].sh_type != SHT_REL) { 417 if (sec->shdr.sh_type != SHT_REL) {
398 continue; 418 continue;
399 } 419 }
400 sec_symtab = shdr[i].sh_link; 420 sec_symtab = sec->link;
401 sec_applies = shdr[i].sh_info; 421 sec_applies = &secs[sec->shdr.sh_info];
402 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { 422 if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
403 continue; 423 continue;
404 } 424 }
405 sh_symtab = symtab[sec_symtab]; 425 sh_symtab = sec_symtab->symtab;
406 sym_strtab = strtab[shdr[sec_symtab].sh_link]; 426 sym_strtab = sec_symtab->link->strtab;
407 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 427 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
408 Elf32_Rel *rel; 428 Elf32_Rel *rel;
409 Elf32_Sym *sym; 429 Elf32_Sym *sym;
410 const char *name; 430 const char *name;
411 rel = &reltab[i][j]; 431 rel = &sec->reltab[j];
412 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 432 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
413 name = sym_name(sym_strtab, sym); 433 name = sym_name(sym_strtab, sym);
414 if (sym->st_shndx != SHN_ABS) { 434 if (sym->st_shndx != SHN_ABS) {
@@ -456,26 +476,28 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
456{ 476{
457 int i; 477 int i;
458 /* Walk through the relocations */ 478 /* Walk through the relocations */
459 for(i = 0; i < ehdr.e_shnum; i++) { 479 for (i = 0; i < ehdr.e_shnum; i++) {
460 char *sym_strtab; 480 char *sym_strtab;
461 Elf32_Sym *sh_symtab; 481 Elf32_Sym *sh_symtab;
462 unsigned sec_applies, sec_symtab; 482 struct section *sec_applies, *sec_symtab;
463 int j; 483 int j;
464 if (shdr[i].sh_type != SHT_REL) { 484 struct section *sec = &secs[i];
485
486 if (sec->shdr.sh_type != SHT_REL) {
465 continue; 487 continue;
466 } 488 }
467 sec_symtab = shdr[i].sh_link; 489 sec_symtab = sec->link;
468 sec_applies = shdr[i].sh_info; 490 sec_applies = &secs[sec->shdr.sh_info];
469 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { 491 if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
470 continue; 492 continue;
471 } 493 }
472 sh_symtab = symtab[sec_symtab]; 494 sh_symtab = sec_symtab->symtab;
473 sym_strtab = strtab[shdr[sec_symtab].sh_link]; 495 sym_strtab = sec_symtab->link->strtab;
474 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
475 Elf32_Rel *rel; 497 Elf32_Rel *rel;
476 Elf32_Sym *sym; 498 Elf32_Sym *sym;
477 unsigned r_type; 499 unsigned r_type;
478 rel = &reltab[i][j]; 500 rel = &sec->reltab[j];
479 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 501 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
480 r_type = ELF32_R_TYPE(rel->r_info); 502 r_type = ELF32_R_TYPE(rel->r_info);
481 /* Don't visit relocations to absolute symbols */ 503 /* Don't visit relocations to absolute symbols */
@@ -539,7 +561,7 @@ static void emit_relocs(int as_text)
539 */ 561 */
540 printf(".section \".data.reloc\",\"a\"\n"); 562 printf(".section \".data.reloc\",\"a\"\n");
541 printf(".balign 4\n"); 563 printf(".balign 4\n");
542 for(i = 0; i < reloc_count; i++) { 564 for (i = 0; i < reloc_count; i++) {
543 printf("\t .long 0x%08lx\n", relocs[i]); 565 printf("\t .long 0x%08lx\n", relocs[i]);
544 } 566 }
545 printf("\n"); 567 printf("\n");
@@ -550,7 +572,7 @@ static void emit_relocs(int as_text)
550 /* Print a stop */ 572 /* Print a stop */
551 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 573 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
552 /* Now print each relocation */ 574 /* Now print each relocation */
553 for(i = 0; i < reloc_count; i++) { 575 for (i = 0; i < reloc_count; i++) {
554 buf[0] = (relocs[i] >> 0) & 0xff; 576 buf[0] = (relocs[i] >> 0) & 0xff;
555 buf[1] = (relocs[i] >> 8) & 0xff; 577 buf[1] = (relocs[i] >> 8) & 0xff;
556 buf[2] = (relocs[i] >> 16) & 0xff; 578 buf[2] = (relocs[i] >> 16) & 0xff;
@@ -577,7 +599,7 @@ int main(int argc, char **argv)
577 show_absolute_relocs = 0; 599 show_absolute_relocs = 0;
578 as_text = 0; 600 as_text = 0;
579 fname = NULL; 601 fname = NULL;
580 for(i = 1; i < argc; i++) { 602 for (i = 1; i < argc; i++) {
581 char *arg = argv[i]; 603 char *arg = argv[i];
582 if (*arg == '-') { 604 if (*arg == '-') {
583 if (strcmp(argv[1], "--abs-syms") == 0) { 605 if (strcmp(argv[1], "--abs-syms") == 0) {
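To summarize the data-structure change in this file (a sketch, not extra patch content): the fixed MAX_SHDRS-sized shdr[]/symtab[]/reltab[]/strtab[] arrays are replaced by one heap-allocated struct section per ELF section header, and the sh_link index becomes a resolved pointer, so the old double indexing turns into pointer chasing. A hypothetical helper spelling out the lookup used in walk_relocs() above:

/* Old:  sym_strtab = strtab[shdr[sec_symtab].sh_link];
 * New:  sym_strtab = sec_symtab->link->strtab;          */
static char *reloc_sym_strtab(struct section *rel_sec)
{
	struct section *symtab_sec = rel_sec->link;	/* SHT_REL -> its SHT_SYMTAB */
	return symtab_sec->link->strtab;		/* SHT_SYMTAB -> its SHT_STRTAB */
}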
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 00e19edd852c..6ec6bb6e9957 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
16 */ 16 */
17 17
18#include "boot.h" 18#include "boot.h"
19#include "bitops.h"
20#include <asm/cpufeature.h>
21
22#include "cpustr.h" 19#include "cpustr.h"
23 20
24static char *cpu_name(int level) 21static char *cpu_name(int level)
@@ -28,6 +25,8 @@ static char *cpu_name(int level)
28 if (level == 64) { 25 if (level == 64) {
29 return "x86-64"; 26 return "x86-64";
30 } else { 27 } else {
28 if (level == 15)
29 level = 6;
31 sprintf(buf, "i%d86", level); 30 sprintf(buf, "i%d86", level);
32 return buf; 31 return buf;
33 } 32 }
@@ -60,17 +59,18 @@ int validate_cpu(void)
60 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
61 60
62 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
63 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
64 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
65 /* Skip to the next string */ 64 /* Skip to the next string */
66 do { 65 msg_strs += 2;
67 msg_strs++; 66 while (*msg_strs++)
68 } while (*msg_strs); 67 ;
69 msg_strs++;
70 } 68 }
71 if (e & 1) { 69 if (e & 1) {
72 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
73 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
74 else 74 else
75 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
76 } 76 }
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee005..4d3ff037201f 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,21 +22,13 @@
22 22
23#ifdef _SETUP 23#ifdef _SETUP
24# include "boot.h" 24# include "boot.h"
25# include "bitops.h"
26#endif 25#endif
27#include <linux/types.h> 26#include <linux/types.h>
28#include <asm/cpufeature.h>
29#include <asm/processor-flags.h> 27#include <asm/processor-flags.h>
30#include <asm/required-features.h> 28#include <asm/required-features.h>
31#include <asm/msr-index.h> 29#include <asm/msr-index.h>
32 30
33struct cpu_features { 31struct cpu_features cpu;
34 int level; /* Family, or 64 for x86-64 */
35 int model;
36 u32 flags[NCAPINTS];
37};
38
39static struct cpu_features cpu;
40static u32 cpu_vendor[3]; 32static u32 cpu_vendor[3];
41static u32 err_flags[NCAPINTS]; 33static u32 err_flags[NCAPINTS];
42 34
@@ -46,12 +38,12 @@ static const u32 req_flags[NCAPINTS] =
46{ 38{
47 REQUIRED_MASK0, 39 REQUIRED_MASK0,
48 REQUIRED_MASK1, 40 REQUIRED_MASK1,
49 REQUIRED_MASK2, 41 0, /* REQUIRED_MASK2 not implemented in this file */
50 REQUIRED_MASK3, 42 0, /* REQUIRED_MASK3 not implemented in this file */
51 REQUIRED_MASK4, 43 REQUIRED_MASK4,
52 REQUIRED_MASK5, 44 0, /* REQUIRED_MASK5 not implemented in this file */
53 REQUIRED_MASK6, 45 REQUIRED_MASK6,
54 REQUIRED_MASK7, 46 0, /* REQUIRED_MASK7 not implemented in this file */
55}; 47};
56 48
57#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) 49#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013b..1aae8f3e5ca1 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
@@ -167,9 +172,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 172 * Scan the BIOS-supported hard disks and query EDD
168 * information... 173 * information...
169 */ 174 */
170 get_edd_info(devno, &ei); 175 if (!get_edd_info(devno, &ei)
171 176 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 177 memcpy(edp, &ei, sizeof ei);
174 edp++; 178 edp++;
175 boot_params.eddbuf_entries++; 179 boot_params.eddbuf_entries++;
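For orientation only (standard MBR layout; the offsets are an assumption from the usual definitions, not taken from this hunk): the new check refuses to report a disk signature unless the sector carries the boot-sector magic.

#include <stdint.h>

/* Layout relied on above:
 *   bytes 440..443  32-bit NT disk signature  -> *mbrsig
 *   bytes 510..511  0xAA55 magic              -> mbr_magic */
static int mbr_has_valid_magic(const uint8_t *sector)
{
	uint16_t magic = (uint16_t)(sector[510] | (sector[511] << 8));
	return magic == 0xAA55;
}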
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acfa..b993062e9a5f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ 30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */ 31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ 32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
34 33
35#ifndef SVGA_MODE 34#ifndef SVGA_MODE
36#define SVGA_MODE ASK_VGA 35#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 77569a4a3be1..197421db1af1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,11 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+	/* Some older BIOSes apparently crash on this call, so filter
+	   it from machines too old to have SpeedStep at all. */
+	if (cpu.level < 6)
+		return;
+
 	asm("int $0x15"
 	    : "=a" (boot_params.ist_info.signature),
 	      "=b" (boot_params.ist_info.command),
@@ -165,6 +170,10 @@ void main(void)
 	/* Set the video mode */
 	set_video();
 
+	/* Parse command line for 'quiet' and pass it to decompressor. */
+	if (cmdline_find_option_bool("quiet"))
+		boot_params.hdr.loadflags |= QUIET_FLAG;
+
 	/* Do the last things and invoke protected mode */
 	go_to_protected_mode();
 }
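
The 'quiet' option ends up as a single bit in the loadflags field of the setup header, so a later boot stage can honour it without re-parsing the command line. A sketch of such a check (helper name is illustrative; QUIET_FLAG is assumed to come from the boot protocol header):

static int boot_is_quiet(const struct boot_params *bp)
{
	return (bp->hdr.loadflags & QUIET_FLAG) != 0;
}
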
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index acad32eb4290..8c3c25f35578 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -53,7 +53,7 @@ static int detect_memory_e820(void)
 
 		count++;
 		desc++;
-	} while (next && count < E820MAX);
+	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }
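
The switch to ARRAY_SIZE() ties the loop bound to the declared size of boot_params.e820_map rather than to a separately maintained constant. For reference, the idiom is the usual element-count macro, shown here in simplified form (the kernel's version adds a compile-time check that the argument really is an array):

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/* example: the bound follows the declaration automatically */
static unsigned int counts[128];
enum { NCOUNTS = ARRAY_SIZE(counts) };	/* 128 */
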
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae9..8ef60f20b371 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
 
 #include <stdio.h>
 
-#include "../kernel/cpu/feature_names.c"
-
-#if NCAPFLAGS > 8
-# error "Need to adjust the boot code handling of CPUID strings"
-#endif
+#include "../kernel/cpu/capflags.c"
 
 int main(void)
 {
-	int i;
+	int i, j;
 	const char *str;
 
 	printf("static const char x86_cap_strs[] = \n");
 
-	for (i = 0; i < NCAPINTS*32; i++) {
-		str = x86_cap_flags[i];
-
-		if (i == NCAPINTS*32-1) {
-			/* The last entry must be unconditional; this
-			   also consumes the compiler-added null character */
-			if (!str)
-				str = "";
-			printf("\t\"\\x%02x\"\"%s\"\n", i, str);
-		} else if (str) {
-			printf("#if REQUIRED_MASK%d & (1 << %d)\n"
-			       "\t\"\\x%02x\"\"%s\\0\"\n"
-			       "#endif\n",
-			       i >> 5, i & 31, i, str);
+	for (i = 0; i < NCAPINTS; i++) {
+		for (j = 0; j < 32; j++) {
+			str = x86_cap_flags[i*32+j];
+
+			if (i == NCAPINTS-1 && j == 31) {
+				/* The last entry must be unconditional; this
+				   also consumes the compiler-added null
+				   character */
+				if (!str)
+					str = "";
+				printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
+				       i, j, str);
+			} else if (str) {
+				printf("#if REQUIRED_MASK%d & (1 << %d)\n"
+				       "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
+				       "#endif\n",
+				       i, j, i, j, str);
+			}
 		}
 	}
 	printf("\t;\n");
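
Each generated entry now starts with two prefix bytes, the feature word index and the bit number, instead of one byte holding a flat bit index, and the last entry is emitted without a trailing '\0' so it consumes the string literal's own terminator. A sketch of how such a table can be searched (illustrative only; it assumes entries appear in ascending (word, bit) order, which the generator above guarantees, and that word and bit are in range):

#include <string.h>

static const char *cap_name(const char *strs, int word, int bit)
{
	const unsigned char *p = (const unsigned char *)strs;

	for (;;) {
		int w = p[0], b = p[1];
		const char *name = (const char *)p + 2;

		if (w == word && b == bit)
			return name;
		if (w > word || (w == word && b > bit))
			return NULL;	/* sorted table: entry not present */
		/* skip the two prefix bytes, the name and its NUL */
		p += 2 + strlen(name) + 1;
	}
}
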
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59e..85a1cd8a8ff8 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
 /*
  * Set up the GDT
  */
-#define GDT_ENTRY(flags, base, limit)		\
-	(((u64)(base & 0xff000000) << 32) |	\
-	 ((u64)flags << 40) |			\
-	 ((u64)(limit & 0x00ff0000) << 32) |	\
-	 ((u64)(base & 0x00ffffff) << 16) |	\
-	 ((u64)(limit & 0x0000ffff)))
 
 struct gdt_ptr {
 	u16 len;
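
The GDT_ENTRY() macro deleted above packs a descriptor's flags, base and limit into the 64-bit layout the CPU expects. As a worked example of the packing, with values chosen for illustration (a flat 4 GiB, 32-bit code segment):

	GDT_ENTRY(0xc09b, 0, 0xfffff)
	  = ((u64)0xc09b << 40)		/* flags: present, ring 0, code, 32-bit, 4K granularity */
	  | ((u64)0x000f0000 << 32)	/* limit bits 19:16 */
	  | 0xffff			/* limit bits 15:0 (both base terms are 0) */
	  = 0x00cf9b000000ffff
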
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index ab049d40a884..141b6e20ed31 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -33,6 +33,8 @@ protected_mode_jump:
 	movw	%cs, %bx
 	shll	$4, %ebx
 	addl	%ebx, 2f
+	jmp	1f			# Short jump to serialize on 386/486
+1:
 
 	movw	$__BOOT_DS, %cx
 	movw	$__BOOT_TSS, %di
@@ -40,8 +42,6 @@ protected_mode_jump:
 	movl	%cr0, %edx
 	orb	$X86_CR0_PE, %dl	# Protected mode
 	movl	%edx, %cr0
-	jmp	1f			# Short jump to serialize on 386/486
-1:
 
 	# Transition to 32-bit mode
 	.byte	0x66, 0xea		# ljmpl opcode
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad08..1e6fe0214c85 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -224,7 +224,7 @@ static void vesa_store_pm_info(void)
 static void vesa_store_mode_params_graphics(void)
 {
 	/* Tell the kernel we're in VESA graphics mode */
-	boot_params.screen_info.orig_video_isVGA = 0x23;
+	boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
 
 	/* Mode parameters */
 	boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 40ecb8d7688c..b939cb476dec 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -259,8 +259,7 @@ static int vga_probe(void)
 	return mode_count[adapter];
 }
 
-__videocard video_vga =
-{
+__videocard video_vga = {
 	.card_name	= "VGA",
 	.probe		= vga_probe,
 	.set_mode	= vga_set_mode,
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ad7ddaaff588..52d0359719d7 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,54 +1,105 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.22-git14 3# Linux kernel version: 2.6.27-rc5
4# Fri Jul 20 09:53:15 2007 4# Wed Sep 3 17:23:09 2008
5# 5#
6# CONFIG_64BIT is not set
6CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set
9CONFIG_X86=y
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
7CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y
8CONFIG_CLOCKSOURCE_WATCHDOG=y 14CONFIG_CLOCKSOURCE_WATCHDOG=y
9CONFIG_GENERIC_CLOCKEVENTS=y 15CONFIG_GENERIC_CLOCKEVENTS=y
10CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y 16CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
11CONFIG_LOCKDEP_SUPPORT=y 17CONFIG_LOCKDEP_SUPPORT=y
12CONFIG_STACKTRACE_SUPPORT=y 18CONFIG_STACKTRACE_SUPPORT=y
13CONFIG_SEMAPHORE_SLEEPERS=y 19CONFIG_HAVE_LATENCYTOP_SUPPORT=y
14CONFIG_X86=y 20CONFIG_FAST_CMPXCHG_LOCAL=y
15CONFIG_MMU=y 21CONFIG_MMU=y
16CONFIG_ZONE_DMA=y 22CONFIG_ZONE_DMA=y
17CONFIG_QUICKLIST=y
18CONFIG_GENERIC_ISA_DMA=y 23CONFIG_GENERIC_ISA_DMA=y
19CONFIG_GENERIC_IOMAP=y 24CONFIG_GENERIC_IOMAP=y
20CONFIG_GENERIC_BUG=y 25CONFIG_GENERIC_BUG=y
21CONFIG_GENERIC_HWEIGHT=y 26CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
22CONFIG_ARCH_MAY_HAVE_PC_FDC=y 28CONFIG_ARCH_MAY_HAVE_PC_FDC=y
23CONFIG_DMI=y 29# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
30CONFIG_RWSEM_XCHGADD_ALGORITHM=y
31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y
35# CONFIG_GENERIC_TIME_VSYSCALL is not set
36CONFIG_ARCH_HAS_CPU_RELAX=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
41CONFIG_ARCH_SUSPEND_POSSIBLE=y
42# CONFIG_ZONE_DMA32 is not set
43CONFIG_ARCH_POPULATES_NODE_MAP=y
44# CONFIG_AUDIT_ARCH is not set
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y
51CONFIG_X86_32_SMP=y
52CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y
55CONFIG_KTIME_SCALAR=y
24CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
25 57
26# 58#
27# Code maturity level options 59# General setup
28# 60#
29CONFIG_EXPERIMENTAL=y 61CONFIG_EXPERIMENTAL=y
30CONFIG_LOCK_KERNEL=y 62CONFIG_LOCK_KERNEL=y
31CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
32
33#
34# General setup
35#
36CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
37CONFIG_LOCALVERSION_AUTO=y 65# CONFIG_LOCALVERSION_AUTO is not set
38CONFIG_SWAP=y 66CONFIG_SWAP=y
39CONFIG_SYSVIPC=y 67CONFIG_SYSVIPC=y
40CONFIG_SYSVIPC_SYSCTL=y 68CONFIG_SYSVIPC_SYSCTL=y
41CONFIG_POSIX_MQUEUE=y 69CONFIG_POSIX_MQUEUE=y
42# CONFIG_BSD_PROCESS_ACCT is not set 70CONFIG_BSD_PROCESS_ACCT=y
43# CONFIG_TASKSTATS is not set 71# CONFIG_BSD_PROCESS_ACCT_V3 is not set
44# CONFIG_USER_NS is not set 72CONFIG_TASKSTATS=y
45# CONFIG_AUDIT is not set 73CONFIG_TASK_DELAY_ACCT=y
46CONFIG_IKCONFIG=y 74CONFIG_TASK_XACCT=y
47CONFIG_IKCONFIG_PROC=y 75CONFIG_TASK_IO_ACCOUNTING=y
48CONFIG_LOG_BUF_SHIFT=18 76CONFIG_AUDIT=y
49# CONFIG_CPUSETS is not set 77CONFIG_AUDITSYSCALL=y
50CONFIG_SYSFS_DEPRECATED=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17
81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y
84# CONFIG_CGROUP_DEVICE is not set
85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
87CONFIG_GROUP_SCHED=y
88CONFIG_FAIR_GROUP_SCHED=y
89# CONFIG_RT_GROUP_SCHED is not set
90# CONFIG_USER_SCHED is not set
91CONFIG_CGROUP_SCHED=y
92CONFIG_CGROUP_CPUACCT=y
93CONFIG_RESOURCE_COUNTERS=y
94# CONFIG_CGROUP_MEM_RES_CTLR is not set
95# CONFIG_SYSFS_DEPRECATED_V2 is not set
96CONFIG_PROC_PID_CPUSET=y
51CONFIG_RELAY=y 97CONFIG_RELAY=y
98CONFIG_NAMESPACES=y
99CONFIG_UTS_NS=y
100CONFIG_IPC_NS=y
101CONFIG_USER_NS=y
102CONFIG_PID_NS=y
52CONFIG_BLK_DEV_INITRD=y 103CONFIG_BLK_DEV_INITRD=y
53CONFIG_INITRAMFS_SOURCE="" 104CONFIG_INITRAMFS_SOURCE=""
54CONFIG_CC_OPTIMIZE_FOR_SIZE=y 105CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -58,11 +109,13 @@ CONFIG_UID16=y
58CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
59CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
60CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
61# CONFIG_KALLSYMS_EXTRA_PASS is not set 112CONFIG_KALLSYMS_EXTRA_PASS=y
62CONFIG_HOTPLUG=y 113CONFIG_HOTPLUG=y
63CONFIG_PRINTK=y 114CONFIG_PRINTK=y
64CONFIG_BUG=y 115CONFIG_BUG=y
65CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
118# CONFIG_COMPAT_BRK is not set
66CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
67CONFIG_FUTEX=y 120CONFIG_FUTEX=y
68CONFIG_ANON_INODES=y 121CONFIG_ANON_INODES=y
@@ -76,21 +129,40 @@ CONFIG_SLUB_DEBUG=y
76# CONFIG_SLAB is not set 129# CONFIG_SLAB is not set
77CONFIG_SLUB=y 130CONFIG_SLUB=y
78# CONFIG_SLOB is not set 131# CONFIG_SLOB is not set
132CONFIG_PROFILING=y
133CONFIG_MARKERS=y
134# CONFIG_OPROFILE is not set
135CONFIG_HAVE_OPROFILE=y
136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
140CONFIG_HAVE_KPROBES=y
141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
146CONFIG_PROC_PAGE_MONITOR=y
147CONFIG_HAVE_GENERIC_DMA_COHERENT=y
148CONFIG_SLABINFO=y
79CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
80# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
81CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
82CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
83CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
84CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
85# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
86# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
87# CONFIG_KMOD is not set 158CONFIG_KMOD=y
88CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
89CONFIG_BLOCK=y 160CONFIG_BLOCK=y
90CONFIG_LBD=y 161# CONFIG_LBD is not set
91# CONFIG_BLK_DEV_IO_TRACE is not set 162CONFIG_BLK_DEV_IO_TRACE=y
92# CONFIG_LSF is not set 163# CONFIG_LSF is not set
93# CONFIG_BLK_DEV_BSG is not set 164CONFIG_BLK_DEV_BSG=y
165# CONFIG_BLK_DEV_INTEGRITY is not set
94 166
95# 167#
96# IO Schedulers 168# IO Schedulers
@@ -103,7 +175,8 @@ CONFIG_IOSCHED_CFQ=y
103# CONFIG_DEFAULT_DEADLINE is not set 175# CONFIG_DEFAULT_DEADLINE is not set
104CONFIG_DEFAULT_CFQ=y 176CONFIG_DEFAULT_CFQ=y
105# CONFIG_DEFAULT_NOOP is not set 177# CONFIG_DEFAULT_NOOP is not set
106CONFIG_DEFAULT_IOSCHED="anticipatory" 178CONFIG_DEFAULT_IOSCHED="cfq"
179CONFIG_CLASSIC_RCU=y
107 180
108# 181#
109# Processor type and features 182# Processor type and features
@@ -111,28 +184,28 @@ CONFIG_DEFAULT_IOSCHED="anticipatory"
111CONFIG_TICK_ONESHOT=y 184CONFIG_TICK_ONESHOT=y
112CONFIG_NO_HZ=y 185CONFIG_NO_HZ=y
113CONFIG_HIGH_RES_TIMERS=y 186CONFIG_HIGH_RES_TIMERS=y
187CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
114CONFIG_SMP=y 188CONFIG_SMP=y
115# CONFIG_X86_PC is not set 189CONFIG_X86_FIND_SMP_CONFIG=y
190CONFIG_X86_MPPARSE=y
191CONFIG_X86_PC=y
116# CONFIG_X86_ELAN is not set 192# CONFIG_X86_ELAN is not set
117# CONFIG_X86_VOYAGER is not set 193# CONFIG_X86_VOYAGER is not set
118# CONFIG_X86_NUMAQ is not set 194# CONFIG_X86_GENERICARCH is not set
119# CONFIG_X86_SUMMIT is not set 195# CONFIG_X86_VSMP is not set
120# CONFIG_X86_BIGSMP is not set 196# CONFIG_X86_RDC321X is not set
121# CONFIG_X86_VISWS is not set 197CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
122CONFIG_X86_GENERICARCH=y 198# CONFIG_PARAVIRT_GUEST is not set
123# CONFIG_X86_ES7000 is not set 199# CONFIG_MEMTEST is not set
124# CONFIG_PARAVIRT is not set
125CONFIG_X86_CYCLONE_TIMER=y
126# CONFIG_M386 is not set 200# CONFIG_M386 is not set
127# CONFIG_M486 is not set 201# CONFIG_M486 is not set
128# CONFIG_M586 is not set 202# CONFIG_M586 is not set
129# CONFIG_M586TSC is not set 203# CONFIG_M586TSC is not set
130# CONFIG_M586MMX is not set 204# CONFIG_M586MMX is not set
131# CONFIG_M686 is not set 205CONFIG_M686=y
132# CONFIG_MPENTIUMII is not set 206# CONFIG_MPENTIUMII is not set
133CONFIG_MPENTIUMIII=y 207# CONFIG_MPENTIUMIII is not set
134# CONFIG_MPENTIUMM is not set 208# CONFIG_MPENTIUMM is not set
135# CONFIG_MCORE2 is not set
136# CONFIG_MPENTIUM4 is not set 209# CONFIG_MPENTIUM4 is not set
137# CONFIG_MK6 is not set 210# CONFIG_MK6 is not set
138# CONFIG_MK7 is not set 211# CONFIG_MK7 is not set
@@ -140,128 +213,141 @@ CONFIG_MPENTIUMIII=y
140# CONFIG_MCRUSOE is not set 213# CONFIG_MCRUSOE is not set
141# CONFIG_MEFFICEON is not set 214# CONFIG_MEFFICEON is not set
142# CONFIG_MWINCHIPC6 is not set 215# CONFIG_MWINCHIPC6 is not set
143# CONFIG_MWINCHIP2 is not set
144# CONFIG_MWINCHIP3D is not set 216# CONFIG_MWINCHIP3D is not set
145# CONFIG_MGEODEGX1 is not set 217# CONFIG_MGEODEGX1 is not set
146# CONFIG_MGEODE_LX is not set 218# CONFIG_MGEODE_LX is not set
147# CONFIG_MCYRIXIII is not set 219# CONFIG_MCYRIXIII is not set
148# CONFIG_MVIAC3_2 is not set 220# CONFIG_MVIAC3_2 is not set
149# CONFIG_MVIAC7 is not set 221# CONFIG_MVIAC7 is not set
222# CONFIG_MPSC is not set
223# CONFIG_MCORE2 is not set
224# CONFIG_GENERIC_CPU is not set
150CONFIG_X86_GENERIC=y 225CONFIG_X86_GENERIC=y
226CONFIG_X86_CPU=y
151CONFIG_X86_CMPXCHG=y 227CONFIG_X86_CMPXCHG=y
152CONFIG_X86_L1_CACHE_SHIFT=7 228CONFIG_X86_L1_CACHE_SHIFT=7
153CONFIG_X86_XADD=y 229CONFIG_X86_XADD=y
154CONFIG_RWSEM_XCHGADD_ALGORITHM=y 230# CONFIG_X86_PPRO_FENCE is not set
155# CONFIG_ARCH_HAS_ILOG2_U32 is not set
156# CONFIG_ARCH_HAS_ILOG2_U64 is not set
157CONFIG_GENERIC_CALIBRATE_DELAY=y
158CONFIG_X86_WP_WORKS_OK=y 231CONFIG_X86_WP_WORKS_OK=y
159CONFIG_X86_INVLPG=y 232CONFIG_X86_INVLPG=y
160CONFIG_X86_BSWAP=y 233CONFIG_X86_BSWAP=y
161CONFIG_X86_POPAD_OK=y 234CONFIG_X86_POPAD_OK=y
162CONFIG_X86_GOOD_APIC=y
163CONFIG_X86_INTEL_USERCOPY=y 235CONFIG_X86_INTEL_USERCOPY=y
164CONFIG_X86_USE_PPRO_CHECKSUM=y 236CONFIG_X86_USE_PPRO_CHECKSUM=y
165CONFIG_X86_TSC=y 237CONFIG_X86_TSC=y
166CONFIG_X86_CMOV=y 238CONFIG_X86_CMOV=y
167CONFIG_X86_MINIMUM_CPU_FAMILY=4 239CONFIG_X86_MINIMUM_CPU_FAMILY=4
240CONFIG_X86_DEBUGCTLMSR=y
168CONFIG_HPET_TIMER=y 241CONFIG_HPET_TIMER=y
169CONFIG_HPET_EMULATE_RTC=y 242CONFIG_HPET_EMULATE_RTC=y
170CONFIG_NR_CPUS=32 243CONFIG_DMI=y
244# CONFIG_IOMMU_HELPER is not set
245CONFIG_NR_CPUS=64
171CONFIG_SCHED_SMT=y 246CONFIG_SCHED_SMT=y
172CONFIG_SCHED_MC=y 247CONFIG_SCHED_MC=y
173# CONFIG_PREEMPT_NONE is not set 248# CONFIG_PREEMPT_NONE is not set
174CONFIG_PREEMPT_VOLUNTARY=y 249CONFIG_PREEMPT_VOLUNTARY=y
175# CONFIG_PREEMPT is not set 250# CONFIG_PREEMPT is not set
176CONFIG_PREEMPT_BKL=y
177CONFIG_X86_LOCAL_APIC=y 251CONFIG_X86_LOCAL_APIC=y
178CONFIG_X86_IO_APIC=y 252CONFIG_X86_IO_APIC=y
179CONFIG_X86_MCE=y 253# CONFIG_X86_MCE is not set
180CONFIG_X86_MCE_NONFATAL=y
181CONFIG_X86_MCE_P4THERMAL=y
182CONFIG_VM86=y 254CONFIG_VM86=y
183# CONFIG_TOSHIBA is not set 255# CONFIG_TOSHIBA is not set
184# CONFIG_I8K is not set 256# CONFIG_I8K is not set
185# CONFIG_X86_REBOOTFIXUPS is not set 257CONFIG_X86_REBOOTFIXUPS=y
186CONFIG_MICROCODE=y 258CONFIG_MICROCODE=y
187CONFIG_MICROCODE_OLD_INTERFACE=y 259CONFIG_MICROCODE_OLD_INTERFACE=y
188CONFIG_X86_MSR=y 260CONFIG_X86_MSR=y
189CONFIG_X86_CPUID=y 261CONFIG_X86_CPUID=y
190
191#
192# Firmware Drivers
193#
194# CONFIG_EDD is not set
195# CONFIG_DELL_RBU is not set
196# CONFIG_DCDBAS is not set
197CONFIG_DMIID=y
198# CONFIG_NOHIGHMEM is not set 262# CONFIG_NOHIGHMEM is not set
199CONFIG_HIGHMEM4G=y 263CONFIG_HIGHMEM4G=y
200# CONFIG_HIGHMEM64G is not set 264# CONFIG_HIGHMEM64G is not set
201CONFIG_PAGE_OFFSET=0xC0000000 265CONFIG_PAGE_OFFSET=0xC0000000
202CONFIG_HIGHMEM=y 266CONFIG_HIGHMEM=y
203CONFIG_ARCH_POPULATES_NODE_MAP=y 267CONFIG_ARCH_FLATMEM_ENABLE=y
268CONFIG_ARCH_SPARSEMEM_ENABLE=y
269CONFIG_ARCH_SELECT_MEMORY_MODEL=y
204CONFIG_SELECT_MEMORY_MODEL=y 270CONFIG_SELECT_MEMORY_MODEL=y
205CONFIG_FLATMEM_MANUAL=y 271CONFIG_FLATMEM_MANUAL=y
206# CONFIG_DISCONTIGMEM_MANUAL is not set 272# CONFIG_DISCONTIGMEM_MANUAL is not set
207# CONFIG_SPARSEMEM_MANUAL is not set 273# CONFIG_SPARSEMEM_MANUAL is not set
208CONFIG_FLATMEM=y 274CONFIG_FLATMEM=y
209CONFIG_FLAT_NODE_MEM_MAP=y 275CONFIG_FLAT_NODE_MEM_MAP=y
210# CONFIG_SPARSEMEM_STATIC is not set 276CONFIG_SPARSEMEM_STATIC=y
277# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
278CONFIG_PAGEFLAGS_EXTENDED=y
211CONFIG_SPLIT_PTLOCK_CPUS=4 279CONFIG_SPLIT_PTLOCK_CPUS=4
212CONFIG_RESOURCES_64BIT=y 280CONFIG_RESOURCES_64BIT=y
213CONFIG_ZONE_DMA_FLAG=1 281CONFIG_ZONE_DMA_FLAG=1
214CONFIG_BOUNCE=y 282CONFIG_BOUNCE=y
215CONFIG_NR_QUICK=1
216CONFIG_VIRT_TO_BUS=y 283CONFIG_VIRT_TO_BUS=y
217# CONFIG_HIGHPTE is not set 284CONFIG_HIGHPTE=y
218# CONFIG_MATH_EMULATION is not set 285# CONFIG_MATH_EMULATION is not set
219CONFIG_MTRR=y 286CONFIG_MTRR=y
220# CONFIG_EFI is not set 287# CONFIG_MTRR_SANITIZER is not set
288CONFIG_X86_PAT=y
289CONFIG_EFI=y
221# CONFIG_IRQBALANCE is not set 290# CONFIG_IRQBALANCE is not set
222CONFIG_SECCOMP=y 291CONFIG_SECCOMP=y
223# CONFIG_HZ_100 is not set 292# CONFIG_HZ_100 is not set
224CONFIG_HZ_250=y 293# CONFIG_HZ_250 is not set
225# CONFIG_HZ_300 is not set 294# CONFIG_HZ_300 is not set
226# CONFIG_HZ_1000 is not set 295CONFIG_HZ_1000=y
227CONFIG_HZ=250 296CONFIG_HZ=1000
228# CONFIG_KEXEC is not set 297CONFIG_SCHED_HRTICK=y
229# CONFIG_CRASH_DUMP is not set 298CONFIG_KEXEC=y
230CONFIG_PHYSICAL_START=0x100000 299CONFIG_CRASH_DUMP=y
231# CONFIG_RELOCATABLE is not set 300# CONFIG_KEXEC_JUMP is not set
232CONFIG_PHYSICAL_ALIGN=0x100000 301CONFIG_PHYSICAL_START=0x1000000
233# CONFIG_HOTPLUG_CPU is not set 302CONFIG_RELOCATABLE=y
234CONFIG_COMPAT_VDSO=y 303CONFIG_PHYSICAL_ALIGN=0x200000
304CONFIG_HOTPLUG_CPU=y
305# CONFIG_COMPAT_VDSO is not set
235CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 306CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
236 307
237# 308#
238# Power management options (ACPI, APM) 309# Power management options
239# 310#
240CONFIG_PM=y 311CONFIG_PM=y
241CONFIG_PM_LEGACY=y 312CONFIG_PM_DEBUG=y
242# CONFIG_PM_DEBUG is not set 313# CONFIG_PM_VERBOSE is not set
243 314CONFIG_CAN_PM_TRACE=y
244# 315CONFIG_PM_TRACE=y
245# ACPI (Advanced Configuration and Power Interface) Support 316CONFIG_PM_TRACE_RTC=y
246# 317CONFIG_PM_SLEEP_SMP=y
318CONFIG_PM_SLEEP=y
319CONFIG_SUSPEND=y
320# CONFIG_PM_TEST_SUSPEND is not set
321CONFIG_SUSPEND_FREEZER=y
322CONFIG_HIBERNATION=y
323CONFIG_PM_STD_PARTITION=""
247CONFIG_ACPI=y 324CONFIG_ACPI=y
325CONFIG_ACPI_SLEEP=y
248CONFIG_ACPI_PROCFS=y 326CONFIG_ACPI_PROCFS=y
327CONFIG_ACPI_PROCFS_POWER=y
328CONFIG_ACPI_SYSFS_POWER=y
329CONFIG_ACPI_PROC_EVENT=y
249CONFIG_ACPI_AC=y 330CONFIG_ACPI_AC=y
250CONFIG_ACPI_BATTERY=y 331CONFIG_ACPI_BATTERY=y
251CONFIG_ACPI_BUTTON=y 332CONFIG_ACPI_BUTTON=y
252CONFIG_ACPI_FAN=y 333CONFIG_ACPI_FAN=y
253# CONFIG_ACPI_DOCK is not set 334CONFIG_ACPI_DOCK=y
335# CONFIG_ACPI_BAY is not set
254CONFIG_ACPI_PROCESSOR=y 336CONFIG_ACPI_PROCESSOR=y
337CONFIG_ACPI_HOTPLUG_CPU=y
255CONFIG_ACPI_THERMAL=y 338CONFIG_ACPI_THERMAL=y
339# CONFIG_ACPI_WMI is not set
256# CONFIG_ACPI_ASUS is not set 340# CONFIG_ACPI_ASUS is not set
257# CONFIG_ACPI_TOSHIBA is not set 341# CONFIG_ACPI_TOSHIBA is not set
258CONFIG_ACPI_BLACKLIST_YEAR=2001 342# CONFIG_ACPI_CUSTOM_DSDT is not set
259CONFIG_ACPI_DEBUG=y 343CONFIG_ACPI_BLACKLIST_YEAR=0
344# CONFIG_ACPI_DEBUG is not set
260CONFIG_ACPI_EC=y 345CONFIG_ACPI_EC=y
346# CONFIG_ACPI_PCI_SLOT is not set
261CONFIG_ACPI_POWER=y 347CONFIG_ACPI_POWER=y
262CONFIG_ACPI_SYSTEM=y 348CONFIG_ACPI_SYSTEM=y
263CONFIG_X86_PM_TIMER=y 349CONFIG_X86_PM_TIMER=y
264# CONFIG_ACPI_CONTAINER is not set 350CONFIG_ACPI_CONTAINER=y
265# CONFIG_ACPI_SBS is not set 351# CONFIG_ACPI_SBS is not set
266# CONFIG_APM is not set 352# CONFIG_APM is not set
267 353
@@ -271,15 +357,17 @@ CONFIG_X86_PM_TIMER=y
271CONFIG_CPU_FREQ=y 357CONFIG_CPU_FREQ=y
272CONFIG_CPU_FREQ_TABLE=y 358CONFIG_CPU_FREQ_TABLE=y
273CONFIG_CPU_FREQ_DEBUG=y 359CONFIG_CPU_FREQ_DEBUG=y
274CONFIG_CPU_FREQ_STAT=y 360# CONFIG_CPU_FREQ_STAT is not set
275# CONFIG_CPU_FREQ_STAT_DETAILS is not set 361# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
276CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y 362# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
277# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set 363CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
364# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
365# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
278CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 366CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
279# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 367# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
280CONFIG_CPU_FREQ_GOV_USERSPACE=y 368CONFIG_CPU_FREQ_GOV_USERSPACE=y
281CONFIG_CPU_FREQ_GOV_ONDEMAND=y 369CONFIG_CPU_FREQ_GOV_ONDEMAND=y
282CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y 370# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
283 371
284# 372#
285# CPUFreq processor drivers 373# CPUFreq processor drivers
@@ -287,8 +375,7 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
287CONFIG_X86_ACPI_CPUFREQ=y 375CONFIG_X86_ACPI_CPUFREQ=y
288# CONFIG_X86_POWERNOW_K6 is not set 376# CONFIG_X86_POWERNOW_K6 is not set
289# CONFIG_X86_POWERNOW_K7 is not set 377# CONFIG_X86_POWERNOW_K7 is not set
290CONFIG_X86_POWERNOW_K8=y 378# CONFIG_X86_POWERNOW_K8 is not set
291CONFIG_X86_POWERNOW_K8_ACPI=y
292# CONFIG_X86_GX_SUSPMOD is not set 379# CONFIG_X86_GX_SUSPMOD is not set
293# CONFIG_X86_SPEEDSTEP_CENTRINO is not set 380# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
294# CONFIG_X86_SPEEDSTEP_ICH is not set 381# CONFIG_X86_SPEEDSTEP_ICH is not set
@@ -302,106 +389,217 @@ CONFIG_X86_POWERNOW_K8_ACPI=y
302# 389#
303# shared options 390# shared options
304# 391#
305CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y 392# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
306# CONFIG_X86_SPEEDSTEP_LIB is not set 393# CONFIG_X86_SPEEDSTEP_LIB is not set
394CONFIG_CPU_IDLE=y
395CONFIG_CPU_IDLE_GOV_LADDER=y
396CONFIG_CPU_IDLE_GOV_MENU=y
307 397
308# 398#
309# Bus options (PCI, PCMCIA, EISA, MCA, ISA) 399# Bus options (PCI etc.)
310# 400#
311CONFIG_PCI=y 401CONFIG_PCI=y
312# CONFIG_PCI_GOBIOS is not set 402# CONFIG_PCI_GOBIOS is not set
313# CONFIG_PCI_GOMMCONFIG is not set 403# CONFIG_PCI_GOMMCONFIG is not set
314# CONFIG_PCI_GODIRECT is not set 404# CONFIG_PCI_GODIRECT is not set
405# CONFIG_PCI_GOOLPC is not set
315CONFIG_PCI_GOANY=y 406CONFIG_PCI_GOANY=y
316CONFIG_PCI_BIOS=y 407CONFIG_PCI_BIOS=y
317CONFIG_PCI_DIRECT=y 408CONFIG_PCI_DIRECT=y
318CONFIG_PCI_MMCONFIG=y 409CONFIG_PCI_MMCONFIG=y
319# CONFIG_PCIEPORTBUS is not set 410CONFIG_PCI_DOMAINS=y
411CONFIG_PCIEPORTBUS=y
412# CONFIG_HOTPLUG_PCI_PCIE is not set
413CONFIG_PCIEAER=y
414# CONFIG_PCIEASPM is not set
320CONFIG_ARCH_SUPPORTS_MSI=y 415CONFIG_ARCH_SUPPORTS_MSI=y
321CONFIG_PCI_MSI=y 416CONFIG_PCI_MSI=y
417# CONFIG_PCI_LEGACY is not set
322# CONFIG_PCI_DEBUG is not set 418# CONFIG_PCI_DEBUG is not set
323# CONFIG_HT_IRQ is not set 419CONFIG_HT_IRQ=y
324CONFIG_ISA_DMA_API=y 420CONFIG_ISA_DMA_API=y
325# CONFIG_ISA is not set 421# CONFIG_ISA is not set
326# CONFIG_MCA is not set 422# CONFIG_MCA is not set
327# CONFIG_SCx200 is not set 423# CONFIG_SCx200 is not set
424# CONFIG_OLPC is not set
328CONFIG_K8_NB=y 425CONFIG_K8_NB=y
329 426CONFIG_PCCARD=y
330# 427# CONFIG_PCMCIA_DEBUG is not set
331# PCCARD (PCMCIA/CardBus) support 428CONFIG_PCMCIA=y
332# 429CONFIG_PCMCIA_LOAD_CIS=y
333# CONFIG_PCCARD is not set 430CONFIG_PCMCIA_IOCTL=y
334# CONFIG_HOTPLUG_PCI is not set 431CONFIG_CARDBUS=y
335 432
336# 433#
337# Executable file formats 434# PC-card bridges
435#
436CONFIG_YENTA=y
437CONFIG_YENTA_O2=y
438CONFIG_YENTA_RICOH=y
439CONFIG_YENTA_TI=y
440CONFIG_YENTA_ENE_TUNE=y
441CONFIG_YENTA_TOSHIBA=y
442# CONFIG_PD6729 is not set
443# CONFIG_I82092 is not set
444CONFIG_PCCARD_NONSTATIC=y
445CONFIG_HOTPLUG_PCI=y
446# CONFIG_HOTPLUG_PCI_FAKE is not set
447# CONFIG_HOTPLUG_PCI_IBM is not set
448# CONFIG_HOTPLUG_PCI_ACPI is not set
449# CONFIG_HOTPLUG_PCI_CPCI is not set
450# CONFIG_HOTPLUG_PCI_SHPC is not set
451
452#
453# Executable file formats / Emulations
338# 454#
339CONFIG_BINFMT_ELF=y 455CONFIG_BINFMT_ELF=y
340# CONFIG_BINFMT_AOUT is not set 456# CONFIG_BINFMT_AOUT is not set
341# CONFIG_BINFMT_MISC is not set 457CONFIG_BINFMT_MISC=y
342
343#
344# Networking
345#
346CONFIG_NET=y 458CONFIG_NET=y
347 459
348# 460#
349# Networking options 461# Networking options
350# 462#
351CONFIG_PACKET=y 463CONFIG_PACKET=y
352# CONFIG_PACKET_MMAP is not set 464CONFIG_PACKET_MMAP=y
353CONFIG_UNIX=y 465CONFIG_UNIX=y
354CONFIG_XFRM=y 466CONFIG_XFRM=y
355# CONFIG_XFRM_USER is not set 467CONFIG_XFRM_USER=y
356# CONFIG_XFRM_SUB_POLICY is not set 468# CONFIG_XFRM_SUB_POLICY is not set
357# CONFIG_XFRM_MIGRATE is not set 469# CONFIG_XFRM_MIGRATE is not set
470# CONFIG_XFRM_STATISTICS is not set
358# CONFIG_NET_KEY is not set 471# CONFIG_NET_KEY is not set
359CONFIG_INET=y 472CONFIG_INET=y
360CONFIG_IP_MULTICAST=y 473CONFIG_IP_MULTICAST=y
361# CONFIG_IP_ADVANCED_ROUTER is not set 474CONFIG_IP_ADVANCED_ROUTER=y
475CONFIG_ASK_IP_FIB_HASH=y
476# CONFIG_IP_FIB_TRIE is not set
362CONFIG_IP_FIB_HASH=y 477CONFIG_IP_FIB_HASH=y
478CONFIG_IP_MULTIPLE_TABLES=y
479CONFIG_IP_ROUTE_MULTIPATH=y
480CONFIG_IP_ROUTE_VERBOSE=y
363CONFIG_IP_PNP=y 481CONFIG_IP_PNP=y
364CONFIG_IP_PNP_DHCP=y 482CONFIG_IP_PNP_DHCP=y
365# CONFIG_IP_PNP_BOOTP is not set 483CONFIG_IP_PNP_BOOTP=y
366# CONFIG_IP_PNP_RARP is not set 484CONFIG_IP_PNP_RARP=y
367# CONFIG_NET_IPIP is not set 485# CONFIG_NET_IPIP is not set
368# CONFIG_NET_IPGRE is not set 486# CONFIG_NET_IPGRE is not set
369# CONFIG_IP_MROUTE is not set 487CONFIG_IP_MROUTE=y
488CONFIG_IP_PIMSM_V1=y
489CONFIG_IP_PIMSM_V2=y
370# CONFIG_ARPD is not set 490# CONFIG_ARPD is not set
371# CONFIG_SYN_COOKIES is not set 491CONFIG_SYN_COOKIES=y
372# CONFIG_INET_AH is not set 492# CONFIG_INET_AH is not set
373# CONFIG_INET_ESP is not set 493# CONFIG_INET_ESP is not set
374# CONFIG_INET_IPCOMP is not set 494# CONFIG_INET_IPCOMP is not set
375# CONFIG_INET_XFRM_TUNNEL is not set 495# CONFIG_INET_XFRM_TUNNEL is not set
376CONFIG_INET_TUNNEL=y 496CONFIG_INET_TUNNEL=y
377CONFIG_INET_XFRM_MODE_TRANSPORT=y 497# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
378CONFIG_INET_XFRM_MODE_TUNNEL=y 498# CONFIG_INET_XFRM_MODE_TUNNEL is not set
379# CONFIG_INET_XFRM_MODE_BEET is not set 499# CONFIG_INET_XFRM_MODE_BEET is not set
380CONFIG_INET_DIAG=y 500CONFIG_INET_LRO=y
381CONFIG_INET_TCP_DIAG=y 501# CONFIG_INET_DIAG is not set
382# CONFIG_TCP_CONG_ADVANCED is not set 502CONFIG_TCP_CONG_ADVANCED=y
503# CONFIG_TCP_CONG_BIC is not set
383CONFIG_TCP_CONG_CUBIC=y 504CONFIG_TCP_CONG_CUBIC=y
505# CONFIG_TCP_CONG_WESTWOOD is not set
506# CONFIG_TCP_CONG_HTCP is not set
507# CONFIG_TCP_CONG_HSTCP is not set
508# CONFIG_TCP_CONG_HYBLA is not set
509# CONFIG_TCP_CONG_VEGAS is not set
510# CONFIG_TCP_CONG_SCALABLE is not set
511# CONFIG_TCP_CONG_LP is not set
512# CONFIG_TCP_CONG_VENO is not set
513# CONFIG_TCP_CONG_YEAH is not set
514# CONFIG_TCP_CONG_ILLINOIS is not set
515# CONFIG_DEFAULT_BIC is not set
516CONFIG_DEFAULT_CUBIC=y
517# CONFIG_DEFAULT_HTCP is not set
518# CONFIG_DEFAULT_VEGAS is not set
519# CONFIG_DEFAULT_WESTWOOD is not set
520# CONFIG_DEFAULT_RENO is not set
384CONFIG_DEFAULT_TCP_CONG="cubic" 521CONFIG_DEFAULT_TCP_CONG="cubic"
385# CONFIG_TCP_MD5SIG is not set 522CONFIG_TCP_MD5SIG=y
523# CONFIG_IP_VS is not set
386CONFIG_IPV6=y 524CONFIG_IPV6=y
387# CONFIG_IPV6_PRIVACY is not set 525# CONFIG_IPV6_PRIVACY is not set
388# CONFIG_IPV6_ROUTER_PREF is not set 526# CONFIG_IPV6_ROUTER_PREF is not set
389# CONFIG_IPV6_OPTIMISTIC_DAD is not set 527# CONFIG_IPV6_OPTIMISTIC_DAD is not set
390# CONFIG_INET6_AH is not set 528CONFIG_INET6_AH=y
391# CONFIG_INET6_ESP is not set 529CONFIG_INET6_ESP=y
392# CONFIG_INET6_IPCOMP is not set 530# CONFIG_INET6_IPCOMP is not set
393# CONFIG_IPV6_MIP6 is not set 531# CONFIG_IPV6_MIP6 is not set
394# CONFIG_INET6_XFRM_TUNNEL is not set 532# CONFIG_INET6_XFRM_TUNNEL is not set
395# CONFIG_INET6_TUNNEL is not set 533# CONFIG_INET6_TUNNEL is not set
396CONFIG_INET6_XFRM_MODE_TRANSPORT=y 534CONFIG_INET6_XFRM_MODE_TRANSPORT=y
397CONFIG_INET6_XFRM_MODE_TUNNEL=y 535CONFIG_INET6_XFRM_MODE_TUNNEL=y
398# CONFIG_INET6_XFRM_MODE_BEET is not set 536CONFIG_INET6_XFRM_MODE_BEET=y
399# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set 537# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
400CONFIG_IPV6_SIT=y 538CONFIG_IPV6_SIT=y
539CONFIG_IPV6_NDISC_NODETYPE=y
401# CONFIG_IPV6_TUNNEL is not set 540# CONFIG_IPV6_TUNNEL is not set
402# CONFIG_IPV6_MULTIPLE_TABLES is not set 541# CONFIG_IPV6_MULTIPLE_TABLES is not set
403# CONFIG_NETWORK_SECMARK is not set 542# CONFIG_IPV6_MROUTE is not set
404# CONFIG_NETFILTER is not set 543CONFIG_NETLABEL=y
544CONFIG_NETWORK_SECMARK=y
545CONFIG_NETFILTER=y
546# CONFIG_NETFILTER_DEBUG is not set
547# CONFIG_NETFILTER_ADVANCED is not set
548
549#
550# Core Netfilter Configuration
551#
552CONFIG_NETFILTER_NETLINK=y
553CONFIG_NETFILTER_NETLINK_LOG=y
554CONFIG_NF_CONNTRACK=y
555CONFIG_NF_CONNTRACK_SECMARK=y
556CONFIG_NF_CONNTRACK_FTP=y
557CONFIG_NF_CONNTRACK_IRC=y
558CONFIG_NF_CONNTRACK_SIP=y
559CONFIG_NF_CT_NETLINK=y
560CONFIG_NETFILTER_XTABLES=y
561CONFIG_NETFILTER_XT_TARGET_MARK=y
562CONFIG_NETFILTER_XT_TARGET_NFLOG=y
563CONFIG_NETFILTER_XT_TARGET_SECMARK=y
564CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
565CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
566CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
567CONFIG_NETFILTER_XT_MATCH_MARK=y
568CONFIG_NETFILTER_XT_MATCH_POLICY=y
569CONFIG_NETFILTER_XT_MATCH_STATE=y
570
571#
572# IP: Netfilter Configuration
573#
574CONFIG_NF_CONNTRACK_IPV4=y
575CONFIG_NF_CONNTRACK_PROC_COMPAT=y
576CONFIG_IP_NF_IPTABLES=y
577CONFIG_IP_NF_FILTER=y
578CONFIG_IP_NF_TARGET_REJECT=y
579CONFIG_IP_NF_TARGET_LOG=y
580CONFIG_IP_NF_TARGET_ULOG=y
581CONFIG_NF_NAT=y
582CONFIG_NF_NAT_NEEDED=y
583CONFIG_IP_NF_TARGET_MASQUERADE=y
584CONFIG_NF_NAT_FTP=y
585CONFIG_NF_NAT_IRC=y
586# CONFIG_NF_NAT_TFTP is not set
587# CONFIG_NF_NAT_AMANDA is not set
588# CONFIG_NF_NAT_PPTP is not set
589# CONFIG_NF_NAT_H323 is not set
590CONFIG_NF_NAT_SIP=y
591CONFIG_IP_NF_MANGLE=y
592
593#
594# IPv6: Netfilter Configuration
595#
596CONFIG_NF_CONNTRACK_IPV6=y
597CONFIG_IP6_NF_IPTABLES=y
598CONFIG_IP6_NF_MATCH_IPV6HEADER=y
599CONFIG_IP6_NF_FILTER=y
600CONFIG_IP6_NF_TARGET_LOG=y
601CONFIG_IP6_NF_TARGET_REJECT=y
602CONFIG_IP6_NF_MANGLE=y
405# CONFIG_IP_DCCP is not set 603# CONFIG_IP_DCCP is not set
406# CONFIG_IP_SCTP is not set 604# CONFIG_IP_SCTP is not set
407# CONFIG_TIPC is not set 605# CONFIG_TIPC is not set
@@ -409,6 +607,7 @@ CONFIG_IPV6_SIT=y
409# CONFIG_BRIDGE is not set 607# CONFIG_BRIDGE is not set
410# CONFIG_VLAN_8021Q is not set 608# CONFIG_VLAN_8021Q is not set
411# CONFIG_DECNET is not set 609# CONFIG_DECNET is not set
610CONFIG_LLC=y
412# CONFIG_LLC2 is not set 611# CONFIG_LLC2 is not set
413# CONFIG_IPX is not set 612# CONFIG_IPX is not set
414# CONFIG_ATALK is not set 613# CONFIG_ATALK is not set
@@ -416,28 +615,89 @@ CONFIG_IPV6_SIT=y
416# CONFIG_LAPB is not set 615# CONFIG_LAPB is not set
417# CONFIG_ECONET is not set 616# CONFIG_ECONET is not set
418# CONFIG_WAN_ROUTER is not set 617# CONFIG_WAN_ROUTER is not set
419 618CONFIG_NET_SCHED=y
420# 619
421# QoS and/or fair queueing 620#
422# 621# Queueing/Scheduling
423# CONFIG_NET_SCHED is not set 622#
623# CONFIG_NET_SCH_CBQ is not set
624# CONFIG_NET_SCH_HTB is not set
625# CONFIG_NET_SCH_HFSC is not set
626# CONFIG_NET_SCH_PRIO is not set
627# CONFIG_NET_SCH_RED is not set
628# CONFIG_NET_SCH_SFQ is not set
629# CONFIG_NET_SCH_TEQL is not set
630# CONFIG_NET_SCH_TBF is not set
631# CONFIG_NET_SCH_GRED is not set
632# CONFIG_NET_SCH_DSMARK is not set
633# CONFIG_NET_SCH_NETEM is not set
634# CONFIG_NET_SCH_INGRESS is not set
635
636#
637# Classification
638#
639CONFIG_NET_CLS=y
640# CONFIG_NET_CLS_BASIC is not set
641# CONFIG_NET_CLS_TCINDEX is not set
642# CONFIG_NET_CLS_ROUTE4 is not set
643# CONFIG_NET_CLS_FW is not set
644# CONFIG_NET_CLS_U32 is not set
645# CONFIG_NET_CLS_RSVP is not set
646# CONFIG_NET_CLS_RSVP6 is not set
647# CONFIG_NET_CLS_FLOW is not set
648CONFIG_NET_EMATCH=y
649CONFIG_NET_EMATCH_STACK=32
650# CONFIG_NET_EMATCH_CMP is not set
651# CONFIG_NET_EMATCH_NBYTE is not set
652# CONFIG_NET_EMATCH_U32 is not set
653# CONFIG_NET_EMATCH_META is not set
654# CONFIG_NET_EMATCH_TEXT is not set
655CONFIG_NET_CLS_ACT=y
656# CONFIG_NET_ACT_POLICE is not set
657# CONFIG_NET_ACT_GACT is not set
658# CONFIG_NET_ACT_MIRRED is not set
659# CONFIG_NET_ACT_IPT is not set
660# CONFIG_NET_ACT_NAT is not set
661# CONFIG_NET_ACT_PEDIT is not set
662# CONFIG_NET_ACT_SIMP is not set
663CONFIG_NET_SCH_FIFO=y
424 664
425# 665#
426# Network testing 666# Network testing
427# 667#
428# CONFIG_NET_PKTGEN is not set 668# CONFIG_NET_PKTGEN is not set
429# CONFIG_NET_TCPPROBE is not set 669# CONFIG_NET_TCPPROBE is not set
430# CONFIG_HAMRADIO is not set 670CONFIG_HAMRADIO=y
671
672#
673# Packet Radio protocols
674#
675# CONFIG_AX25 is not set
676# CONFIG_CAN is not set
431# CONFIG_IRDA is not set 677# CONFIG_IRDA is not set
432# CONFIG_BT is not set 678# CONFIG_BT is not set
433# CONFIG_AF_RXRPC is not set 679# CONFIG_AF_RXRPC is not set
680CONFIG_FIB_RULES=y
434 681
435# 682#
436# Wireless 683# Wireless
437# 684#
438# CONFIG_CFG80211 is not set 685CONFIG_CFG80211=y
439# CONFIG_WIRELESS_EXT is not set 686CONFIG_NL80211=y
440# CONFIG_MAC80211 is not set 687CONFIG_WIRELESS_EXT=y
688CONFIG_WIRELESS_EXT_SYSFS=y
689CONFIG_MAC80211=y
690
691#
692# Rate control algorithm selection
693#
694CONFIG_MAC80211_RC_PID=y
695CONFIG_MAC80211_RC_DEFAULT_PID=y
696CONFIG_MAC80211_RC_DEFAULT="pid"
697# CONFIG_MAC80211_MESH is not set
698CONFIG_MAC80211_LEDS=y
699# CONFIG_MAC80211_DEBUGFS is not set
700# CONFIG_MAC80211_DEBUG_MENU is not set
441# CONFIG_IEEE80211 is not set 701# CONFIG_IEEE80211 is not set
442# CONFIG_RFKILL is not set 702# CONFIG_RFKILL is not set
443# CONFIG_NET_9P is not set 703# CONFIG_NET_9P is not set
@@ -449,13 +709,17 @@ CONFIG_IPV6_SIT=y
449# 709#
450# Generic Driver Options 710# Generic Driver Options
451# 711#
712CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
452CONFIG_STANDALONE=y 713CONFIG_STANDALONE=y
453CONFIG_PREVENT_FIRMWARE_BUILD=y 714CONFIG_PREVENT_FIRMWARE_BUILD=y
454CONFIG_FW_LOADER=y 715CONFIG_FW_LOADER=y
716CONFIG_FIRMWARE_IN_KERNEL=y
717CONFIG_EXTRA_FIRMWARE=""
455# CONFIG_DEBUG_DRIVER is not set 718# CONFIG_DEBUG_DRIVER is not set
456# CONFIG_DEBUG_DEVRES is not set 719CONFIG_DEBUG_DEVRES=y
457# CONFIG_SYS_HYPERVISOR is not set 720# CONFIG_SYS_HYPERVISOR is not set
458# CONFIG_CONNECTOR is not set 721CONFIG_CONNECTOR=y
722CONFIG_PROC_EVENTS=y
459# CONFIG_MTD is not set 723# CONFIG_MTD is not set
460# CONFIG_PARPORT is not set 724# CONFIG_PARPORT is not set
461CONFIG_PNP=y 725CONFIG_PNP=y
@@ -466,7 +730,7 @@ CONFIG_PNP=y
466# 730#
467CONFIG_PNPACPI=y 731CONFIG_PNPACPI=y
468CONFIG_BLK_DEV=y 732CONFIG_BLK_DEV=y
469CONFIG_BLK_DEV_FD=y 733# CONFIG_BLK_DEV_FD is not set
470# CONFIG_BLK_CPQ_DA is not set 734# CONFIG_BLK_CPQ_DA is not set
471# CONFIG_BLK_CPQ_CISS_DA is not set 735# CONFIG_BLK_CPQ_CISS_DA is not set
472# CONFIG_BLK_DEV_DAC960 is not set 736# CONFIG_BLK_DEV_DAC960 is not set
@@ -479,83 +743,30 @@ CONFIG_BLK_DEV_LOOP=y
479# CONFIG_BLK_DEV_UB is not set 743# CONFIG_BLK_DEV_UB is not set
480CONFIG_BLK_DEV_RAM=y 744CONFIG_BLK_DEV_RAM=y
481CONFIG_BLK_DEV_RAM_COUNT=16 745CONFIG_BLK_DEV_RAM_COUNT=16
482CONFIG_BLK_DEV_RAM_SIZE=4096 746CONFIG_BLK_DEV_RAM_SIZE=16384
483CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 747# CONFIG_BLK_DEV_XIP is not set
484# CONFIG_CDROM_PKTCDVD is not set 748# CONFIG_CDROM_PKTCDVD is not set
485# CONFIG_ATA_OVER_ETH is not set 749# CONFIG_ATA_OVER_ETH is not set
750# CONFIG_BLK_DEV_HD is not set
486CONFIG_MISC_DEVICES=y 751CONFIG_MISC_DEVICES=y
487# CONFIG_IBM_ASM is not set 752# CONFIG_IBM_ASM is not set
488# CONFIG_PHANTOM is not set 753# CONFIG_PHANTOM is not set
489# CONFIG_EEPROM_93CX6 is not set 754# CONFIG_EEPROM_93CX6 is not set
490# CONFIG_SGI_IOC4 is not set 755# CONFIG_SGI_IOC4 is not set
491# CONFIG_TIFM_CORE is not set 756# CONFIG_TIFM_CORE is not set
757# CONFIG_ACER_WMI is not set
758# CONFIG_ASUS_LAPTOP is not set
759# CONFIG_FUJITSU_LAPTOP is not set
760# CONFIG_TC1100_WMI is not set
761# CONFIG_MSI_LAPTOP is not set
762# CONFIG_COMPAL_LAPTOP is not set
492# CONFIG_SONY_LAPTOP is not set 763# CONFIG_SONY_LAPTOP is not set
493# CONFIG_THINKPAD_ACPI is not set 764# CONFIG_THINKPAD_ACPI is not set
494CONFIG_IDE=y 765# CONFIG_INTEL_MENLOW is not set
495CONFIG_BLK_DEV_IDE=y 766# CONFIG_ENCLOSURE_SERVICES is not set
496 767# CONFIG_HP_ILO is not set
497# 768CONFIG_HAVE_IDE=y
498# Please see Documentation/ide.txt for help/info on IDE drives 769# CONFIG_IDE is not set
499#
500# CONFIG_BLK_DEV_IDE_SATA is not set
501# CONFIG_BLK_DEV_HD_IDE is not set
502CONFIG_BLK_DEV_IDEDISK=y
503CONFIG_IDEDISK_MULTI_MODE=y
504CONFIG_BLK_DEV_IDECD=y
505# CONFIG_BLK_DEV_IDETAPE is not set
506# CONFIG_BLK_DEV_IDEFLOPPY is not set
507# CONFIG_BLK_DEV_IDESCSI is not set
508CONFIG_BLK_DEV_IDEACPI=y
509# CONFIG_IDE_TASK_IOCTL is not set
510CONFIG_IDE_PROC_FS=y
511
512#
513# IDE chipset support/bugfixes
514#
515CONFIG_IDE_GENERIC=y
516# CONFIG_BLK_DEV_CMD640 is not set
517# CONFIG_BLK_DEV_IDEPNP is not set
518CONFIG_BLK_DEV_IDEPCI=y
519# CONFIG_IDEPCI_SHARE_IRQ is not set
520CONFIG_IDEPCI_PCIBUS_ORDER=y
521# CONFIG_BLK_DEV_OFFBOARD is not set
522# CONFIG_BLK_DEV_GENERIC is not set
523# CONFIG_BLK_DEV_OPTI621 is not set
524# CONFIG_BLK_DEV_RZ1000 is not set
525CONFIG_BLK_DEV_IDEDMA_PCI=y
526# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
527# CONFIG_IDEDMA_ONLYDISK is not set
528# CONFIG_BLK_DEV_AEC62XX is not set
529# CONFIG_BLK_DEV_ALI15X3 is not set
530CONFIG_BLK_DEV_AMD74XX=y
531# CONFIG_BLK_DEV_ATIIXP is not set
532# CONFIG_BLK_DEV_CMD64X is not set
533# CONFIG_BLK_DEV_TRIFLEX is not set
534# CONFIG_BLK_DEV_CY82C693 is not set
535# CONFIG_BLK_DEV_CS5520 is not set
536# CONFIG_BLK_DEV_CS5530 is not set
537# CONFIG_BLK_DEV_CS5535 is not set
538# CONFIG_BLK_DEV_HPT34X is not set
539# CONFIG_BLK_DEV_HPT366 is not set
540# CONFIG_BLK_DEV_JMICRON is not set
541# CONFIG_BLK_DEV_SC1200 is not set
542CONFIG_BLK_DEV_PIIX=y
543# CONFIG_BLK_DEV_IT8213 is not set
544# CONFIG_BLK_DEV_IT821X is not set
545# CONFIG_BLK_DEV_NS87415 is not set
546# CONFIG_BLK_DEV_PDC202XX_OLD is not set
547# CONFIG_BLK_DEV_PDC202XX_NEW is not set
548# CONFIG_BLK_DEV_SVWKS is not set
549# CONFIG_BLK_DEV_SIIMAGE is not set
550# CONFIG_BLK_DEV_SIS5513 is not set
551# CONFIG_BLK_DEV_SLC90E66 is not set
552# CONFIG_BLK_DEV_TRM290 is not set
553# CONFIG_BLK_DEV_VIA82CXXX is not set
554# CONFIG_BLK_DEV_TC86C001 is not set
555# CONFIG_IDE_ARM is not set
556CONFIG_BLK_DEV_IDEDMA=y
557# CONFIG_IDEDMA_IVB is not set
558# CONFIG_BLK_DEV_HD is not set
559 770
560# 771#
561# SCSI device support 772# SCSI device support
@@ -564,8 +775,8 @@ CONFIG_BLK_DEV_IDEDMA=y
564CONFIG_SCSI=y 775CONFIG_SCSI=y
565CONFIG_SCSI_DMA=y 776CONFIG_SCSI_DMA=y
566# CONFIG_SCSI_TGT is not set 777# CONFIG_SCSI_TGT is not set
567CONFIG_SCSI_NETLINK=y 778# CONFIG_SCSI_NETLINK is not set
568# CONFIG_SCSI_PROC_FS is not set 779CONFIG_SCSI_PROC_FS=y
569 780
570# 781#
571# SCSI support type (disk, tape, CD-ROM) 782# SCSI support type (disk, tape, CD-ROM)
@@ -574,7 +785,7 @@ CONFIG_BLK_DEV_SD=y
574# CONFIG_CHR_DEV_ST is not set 785# CONFIG_CHR_DEV_ST is not set
575# CONFIG_CHR_DEV_OSST is not set 786# CONFIG_CHR_DEV_OSST is not set
576CONFIG_BLK_DEV_SR=y 787CONFIG_BLK_DEV_SR=y
577# CONFIG_BLK_DEV_SR_VENDOR is not set 788CONFIG_BLK_DEV_SR_VENDOR=y
578CONFIG_CHR_DEV_SG=y 789CONFIG_CHR_DEV_SG=y
579# CONFIG_CHR_DEV_SCH is not set 790# CONFIG_CHR_DEV_SCH is not set
580 791
@@ -582,7 +793,7 @@ CONFIG_CHR_DEV_SG=y
582# Some SCSI devices (e.g. CD jukebox) support multiple LUNs 793# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
583# 794#
584# CONFIG_SCSI_MULTI_LUN is not set 795# CONFIG_SCSI_MULTI_LUN is not set
585# CONFIG_SCSI_CONSTANTS is not set 796CONFIG_SCSI_CONSTANTS=y
586# CONFIG_SCSI_LOGGING is not set 797# CONFIG_SCSI_LOGGING is not set
587# CONFIG_SCSI_SCAN_ASYNC is not set 798# CONFIG_SCSI_SCAN_ASYNC is not set
588CONFIG_SCSI_WAIT_SCAN=m 799CONFIG_SCSI_WAIT_SCAN=m
@@ -591,81 +802,38 @@ CONFIG_SCSI_WAIT_SCAN=m
591# SCSI Transports 802# SCSI Transports
592# 803#
593CONFIG_SCSI_SPI_ATTRS=y 804CONFIG_SCSI_SPI_ATTRS=y
594CONFIG_SCSI_FC_ATTRS=y 805# CONFIG_SCSI_FC_ATTRS is not set
595# CONFIG_SCSI_ISCSI_ATTRS is not set 806CONFIG_SCSI_ISCSI_ATTRS=y
596# CONFIG_SCSI_SAS_ATTRS is not set 807# CONFIG_SCSI_SAS_ATTRS is not set
597# CONFIG_SCSI_SAS_LIBSAS is not set 808# CONFIG_SCSI_SAS_LIBSAS is not set
598 809# CONFIG_SCSI_SRP_ATTRS is not set
599# 810# CONFIG_SCSI_LOWLEVEL is not set
600# SCSI low-level drivers 811# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
601# 812# CONFIG_SCSI_DH is not set
602# CONFIG_ISCSI_TCP is not set
603CONFIG_BLK_DEV_3W_XXXX_RAID=y
604# CONFIG_SCSI_3W_9XXX is not set
605# CONFIG_SCSI_ACARD is not set
606# CONFIG_SCSI_AACRAID is not set
607CONFIG_SCSI_AIC7XXX=y
608CONFIG_AIC7XXX_CMDS_PER_DEVICE=32
609CONFIG_AIC7XXX_RESET_DELAY_MS=5000
610CONFIG_AIC7XXX_DEBUG_ENABLE=y
611CONFIG_AIC7XXX_DEBUG_MASK=0
612CONFIG_AIC7XXX_REG_PRETTY_PRINT=y
613# CONFIG_SCSI_AIC7XXX_OLD is not set
614CONFIG_SCSI_AIC79XX=y
615CONFIG_AIC79XX_CMDS_PER_DEVICE=32
616CONFIG_AIC79XX_RESET_DELAY_MS=4000
617# CONFIG_AIC79XX_DEBUG_ENABLE is not set
618CONFIG_AIC79XX_DEBUG_MASK=0
619# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
620# CONFIG_SCSI_AIC94XX is not set
621# CONFIG_SCSI_DPT_I2O is not set
622# CONFIG_SCSI_ADVANSYS is not set
623# CONFIG_SCSI_ARCMSR is not set
624# CONFIG_MEGARAID_NEWGEN is not set
625# CONFIG_MEGARAID_LEGACY is not set
626# CONFIG_MEGARAID_SAS is not set
627# CONFIG_SCSI_HPTIOP is not set
628# CONFIG_SCSI_BUSLOGIC is not set
629# CONFIG_SCSI_DMX3191D is not set
630# CONFIG_SCSI_EATA is not set
631# CONFIG_SCSI_FUTURE_DOMAIN is not set
632# CONFIG_SCSI_GDTH is not set
633# CONFIG_SCSI_IPS is not set
634# CONFIG_SCSI_INITIO is not set
635# CONFIG_SCSI_INIA100 is not set
636# CONFIG_SCSI_STEX is not set
637# CONFIG_SCSI_SYM53C8XX_2 is not set
638# CONFIG_SCSI_IPR is not set
639# CONFIG_SCSI_QLOGIC_1280 is not set
640# CONFIG_SCSI_QLA_FC is not set
641# CONFIG_SCSI_QLA_ISCSI is not set
642# CONFIG_SCSI_LPFC is not set
643# CONFIG_SCSI_DC395x is not set
644# CONFIG_SCSI_DC390T is not set
645# CONFIG_SCSI_NSP32 is not set
646# CONFIG_SCSI_DEBUG is not set
647# CONFIG_SCSI_SRP is not set
648CONFIG_ATA=y 813CONFIG_ATA=y
649# CONFIG_ATA_NONSTANDARD is not set 814# CONFIG_ATA_NONSTANDARD is not set
650CONFIG_ATA_ACPI=y 815CONFIG_ATA_ACPI=y
816CONFIG_SATA_PMP=y
651CONFIG_SATA_AHCI=y 817CONFIG_SATA_AHCI=y
652CONFIG_SATA_SVW=y 818# CONFIG_SATA_SIL24 is not set
819CONFIG_ATA_SFF=y
820# CONFIG_SATA_SVW is not set
653CONFIG_ATA_PIIX=y 821CONFIG_ATA_PIIX=y
654# CONFIG_SATA_MV is not set 822# CONFIG_SATA_MV is not set
655CONFIG_SATA_NV=y 823# CONFIG_SATA_NV is not set
656# CONFIG_PDC_ADMA is not set 824# CONFIG_PDC_ADMA is not set
657# CONFIG_SATA_QSTOR is not set 825# CONFIG_SATA_QSTOR is not set
658# CONFIG_SATA_PROMISE is not set 826# CONFIG_SATA_PROMISE is not set
659# CONFIG_SATA_SX4 is not set 827# CONFIG_SATA_SX4 is not set
660CONFIG_SATA_SIL=y 828# CONFIG_SATA_SIL is not set
661# CONFIG_SATA_SIL24 is not set
662# CONFIG_SATA_SIS is not set 829# CONFIG_SATA_SIS is not set
663# CONFIG_SATA_ULI is not set 830# CONFIG_SATA_ULI is not set
664CONFIG_SATA_VIA=y 831# CONFIG_SATA_VIA is not set
665# CONFIG_SATA_VITESSE is not set 832# CONFIG_SATA_VITESSE is not set
666# CONFIG_SATA_INIC162X is not set 833# CONFIG_SATA_INIC162X is not set
834# CONFIG_PATA_ACPI is not set
667# CONFIG_PATA_ALI is not set 835# CONFIG_PATA_ALI is not set
668# CONFIG_PATA_AMD is not set 836CONFIG_PATA_AMD=y
669# CONFIG_PATA_ARTOP is not set 837# CONFIG_PATA_ARTOP is not set
670# CONFIG_PATA_ATIIXP is not set 838# CONFIG_PATA_ATIIXP is not set
671# CONFIG_PATA_CMD640_PCI is not set 839# CONFIG_PATA_CMD640_PCI is not set
@@ -673,9 +841,10 @@ CONFIG_SATA_VIA=y
673# CONFIG_PATA_CS5520 is not set 841# CONFIG_PATA_CS5520 is not set
674# CONFIG_PATA_CS5530 is not set 842# CONFIG_PATA_CS5530 is not set
675# CONFIG_PATA_CS5535 is not set 843# CONFIG_PATA_CS5535 is not set
844# CONFIG_PATA_CS5536 is not set
676# CONFIG_PATA_CYPRESS is not set 845# CONFIG_PATA_CYPRESS is not set
677# CONFIG_PATA_EFAR is not set 846# CONFIG_PATA_EFAR is not set
678# CONFIG_ATA_GENERIC is not set 847CONFIG_ATA_GENERIC=y
679# CONFIG_PATA_HPT366 is not set 848# CONFIG_PATA_HPT366 is not set
680# CONFIG_PATA_HPT37X is not set 849# CONFIG_PATA_HPT37X is not set
681# CONFIG_PATA_HPT3X2N is not set 850# CONFIG_PATA_HPT3X2N is not set
@@ -685,12 +854,15 @@ CONFIG_SATA_VIA=y
685# CONFIG_PATA_JMICRON is not set 854# CONFIG_PATA_JMICRON is not set
686# CONFIG_PATA_TRIFLEX is not set 855# CONFIG_PATA_TRIFLEX is not set
687# CONFIG_PATA_MARVELL is not set 856# CONFIG_PATA_MARVELL is not set
688# CONFIG_PATA_MPIIX is not set 857CONFIG_PATA_MPIIX=y
689# CONFIG_PATA_OLDPIIX is not set 858CONFIG_PATA_OLDPIIX=y
690# CONFIG_PATA_NETCELL is not set 859# CONFIG_PATA_NETCELL is not set
860# CONFIG_PATA_NINJA32 is not set
691# CONFIG_PATA_NS87410 is not set 861# CONFIG_PATA_NS87410 is not set
862# CONFIG_PATA_NS87415 is not set
692# CONFIG_PATA_OPTI is not set 863# CONFIG_PATA_OPTI is not set
693# CONFIG_PATA_OPTIDMA is not set 864# CONFIG_PATA_OPTIDMA is not set
865# CONFIG_PATA_PCMCIA is not set
694# CONFIG_PATA_PDC_OLD is not set 866# CONFIG_PATA_PDC_OLD is not set
695# CONFIG_PATA_RADISYS is not set 867# CONFIG_PATA_RADISYS is not set
696# CONFIG_PATA_RZ1000 is not set 868# CONFIG_PATA_RZ1000 is not set
@@ -701,107 +873,106 @@ CONFIG_SATA_VIA=y
701# CONFIG_PATA_SIS is not set 873# CONFIG_PATA_SIS is not set
702# CONFIG_PATA_VIA is not set 874# CONFIG_PATA_VIA is not set
703# CONFIG_PATA_WINBOND is not set 875# CONFIG_PATA_WINBOND is not set
876CONFIG_PATA_SCH=y
704CONFIG_MD=y 877CONFIG_MD=y
705# CONFIG_BLK_DEV_MD is not set 878CONFIG_BLK_DEV_MD=y
879# CONFIG_MD_LINEAR is not set
880# CONFIG_MD_RAID0 is not set
881# CONFIG_MD_RAID1 is not set
882# CONFIG_MD_RAID10 is not set
883# CONFIG_MD_RAID456 is not set
884# CONFIG_MD_MULTIPATH is not set
885# CONFIG_MD_FAULTY is not set
706CONFIG_BLK_DEV_DM=y 886CONFIG_BLK_DEV_DM=y
707# CONFIG_DM_DEBUG is not set 887# CONFIG_DM_DEBUG is not set
708# CONFIG_DM_CRYPT is not set 888# CONFIG_DM_CRYPT is not set
709# CONFIG_DM_SNAPSHOT is not set 889# CONFIG_DM_SNAPSHOT is not set
710# CONFIG_DM_MIRROR is not set 890CONFIG_DM_MIRROR=y
711# CONFIG_DM_ZERO is not set 891CONFIG_DM_ZERO=y
712# CONFIG_DM_MULTIPATH is not set 892# CONFIG_DM_MULTIPATH is not set
713# CONFIG_DM_DELAY is not set 893# CONFIG_DM_DELAY is not set
714 894# CONFIG_DM_UEVENT is not set
715# 895# CONFIG_FUSION is not set
716# Fusion MPT device support
717#
718CONFIG_FUSION=y
719CONFIG_FUSION_SPI=y
720# CONFIG_FUSION_FC is not set
721# CONFIG_FUSION_SAS is not set
722CONFIG_FUSION_MAX_SGE=128
723# CONFIG_FUSION_CTL is not set
724 896
725# 897#
726# IEEE 1394 (FireWire) support 898# IEEE 1394 (FireWire) support
727# 899#
728# CONFIG_FIREWIRE is not set
729CONFIG_IEEE1394=y
730
731#
732# Subsystem Options
733#
734# CONFIG_IEEE1394_VERBOSEDEBUG is not set
735
736#
737# Controllers
738#
739
740#
741# Texas Instruments PCILynx requires I2C
742#
743CONFIG_IEEE1394_OHCI1394=y
744 900
745# 901#
746# Protocols 902# Enable only one of the two stacks, unless you know what you are doing
747# 903#
748# CONFIG_IEEE1394_VIDEO1394 is not set 904# CONFIG_FIREWIRE is not set
749# CONFIG_IEEE1394_SBP2 is not set 905# CONFIG_IEEE1394 is not set
750# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
751# CONFIG_IEEE1394_ETH1394 is not set
752# CONFIG_IEEE1394_DV1394 is not set
753CONFIG_IEEE1394_RAWIO=y
754# CONFIG_I2O is not set 906# CONFIG_I2O is not set
755CONFIG_MACINTOSH_DRIVERS=y 907CONFIG_MACINTOSH_DRIVERS=y
756# CONFIG_MAC_EMUMOUSEBTN is not set 908CONFIG_MAC_EMUMOUSEBTN=y
757CONFIG_NETDEVICES=y 909CONFIG_NETDEVICES=y
758CONFIG_NETDEVICES_MULTIQUEUE=y 910# CONFIG_IFB is not set
759# CONFIG_DUMMY is not set 911# CONFIG_DUMMY is not set
760# CONFIG_BONDING is not set 912# CONFIG_BONDING is not set
761# CONFIG_MACVLAN is not set 913# CONFIG_MACVLAN is not set
762# CONFIG_EQUALIZER is not set 914# CONFIG_EQUALIZER is not set
763# CONFIG_TUN is not set 915# CONFIG_TUN is not set
916# CONFIG_VETH is not set
764# CONFIG_NET_SB1000 is not set 917# CONFIG_NET_SB1000 is not set
765# CONFIG_ARCNET is not set 918# CONFIG_ARCNET is not set
766# CONFIG_PHYLIB is not set 919CONFIG_PHYLIB=y
920
921#
922# MII PHY device drivers
923#
924# CONFIG_MARVELL_PHY is not set
925# CONFIG_DAVICOM_PHY is not set
926# CONFIG_QSEMI_PHY is not set
927# CONFIG_LXT_PHY is not set
928# CONFIG_CICADA_PHY is not set
929# CONFIG_VITESSE_PHY is not set
930# CONFIG_SMSC_PHY is not set
931# CONFIG_BROADCOM_PHY is not set
932# CONFIG_ICPLUS_PHY is not set
933# CONFIG_REALTEK_PHY is not set
934# CONFIG_FIXED_PHY is not set
935# CONFIG_MDIO_BITBANG is not set
767CONFIG_NET_ETHERNET=y 936CONFIG_NET_ETHERNET=y
768CONFIG_MII=y 937CONFIG_MII=y
769# CONFIG_HAPPYMEAL is not set 938# CONFIG_HAPPYMEAL is not set
770# CONFIG_SUNGEM is not set 939# CONFIG_SUNGEM is not set
771# CONFIG_CASSINI is not set 940# CONFIG_CASSINI is not set
772CONFIG_NET_VENDOR_3COM=y 941CONFIG_NET_VENDOR_3COM=y
773CONFIG_VORTEX=y 942# CONFIG_VORTEX is not set
774# CONFIG_TYPHOON is not set 943# CONFIG_TYPHOON is not set
775CONFIG_NET_TULIP=y 944CONFIG_NET_TULIP=y
776# CONFIG_DE2104X is not set 945# CONFIG_DE2104X is not set
777CONFIG_TULIP=y 946# CONFIG_TULIP is not set
778# CONFIG_TULIP_MWI is not set
779# CONFIG_TULIP_MMIO is not set
780# CONFIG_TULIP_NAPI is not set
781# CONFIG_DE4X5 is not set 947# CONFIG_DE4X5 is not set
782# CONFIG_WINBOND_840 is not set 948# CONFIG_WINBOND_840 is not set
783# CONFIG_DM9102 is not set 949# CONFIG_DM9102 is not set
784# CONFIG_ULI526X is not set 950# CONFIG_ULI526X is not set
951# CONFIG_PCMCIA_XIRCOM is not set
785# CONFIG_HP100 is not set 952# CONFIG_HP100 is not set
953# CONFIG_IBM_NEW_EMAC_ZMII is not set
954# CONFIG_IBM_NEW_EMAC_RGMII is not set
955# CONFIG_IBM_NEW_EMAC_TAH is not set
956# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
786CONFIG_NET_PCI=y 957CONFIG_NET_PCI=y
787# CONFIG_PCNET32 is not set 958# CONFIG_PCNET32 is not set
788# CONFIG_AMD8111_ETH is not set 959# CONFIG_AMD8111_ETH is not set
789# CONFIG_ADAPTEC_STARFIRE is not set 960# CONFIG_ADAPTEC_STARFIRE is not set
790CONFIG_B44=y 961# CONFIG_B44 is not set
791CONFIG_FORCEDETH=y 962CONFIG_FORCEDETH=y
792# CONFIG_FORCEDETH_NAPI is not set 963# CONFIG_FORCEDETH_NAPI is not set
793# CONFIG_DGRS is not set
794# CONFIG_EEPRO100 is not set 964# CONFIG_EEPRO100 is not set
795CONFIG_E100=y 965CONFIG_E100=y
796# CONFIG_FEALNX is not set 966# CONFIG_FEALNX is not set
797# CONFIG_NATSEMI is not set 967# CONFIG_NATSEMI is not set
798# CONFIG_NE2K_PCI is not set 968CONFIG_NE2K_PCI=y
799CONFIG_8139CP=y 969# CONFIG_8139CP is not set
800CONFIG_8139TOO=y 970CONFIG_8139TOO=y
801# CONFIG_8139TOO_PIO is not set 971# CONFIG_8139TOO_PIO is not set
802# CONFIG_8139TOO_TUNE_TWISTER is not set 972# CONFIG_8139TOO_TUNE_TWISTER is not set
803# CONFIG_8139TOO_8129 is not set 973# CONFIG_8139TOO_8129 is not set
804# CONFIG_8139_OLD_RX_RESET is not set 974# CONFIG_8139_OLD_RX_RESET is not set
975# CONFIG_R6040 is not set
805# CONFIG_SIS900 is not set 976# CONFIG_SIS900 is not set
806# CONFIG_EPIC100 is not set 977# CONFIG_EPIC100 is not set
807# CONFIG_SUNDANCE is not set 978# CONFIG_SUNDANCE is not set
@@ -812,36 +983,77 @@ CONFIG_NETDEV_1000=y
812# CONFIG_ACENIC is not set 983# CONFIG_ACENIC is not set
813# CONFIG_DL2K is not set 984# CONFIG_DL2K is not set
814CONFIG_E1000=y 985CONFIG_E1000=y
815# CONFIG_E1000_NAPI is not set
816# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 986# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
987CONFIG_E1000E=y
988# CONFIG_IP1000 is not set
989# CONFIG_IGB is not set
817# CONFIG_NS83820 is not set 990# CONFIG_NS83820 is not set
818# CONFIG_HAMACHI is not set 991# CONFIG_HAMACHI is not set
819# CONFIG_YELLOWFIN is not set 992# CONFIG_YELLOWFIN is not set
820CONFIG_R8169=y 993CONFIG_R8169=y
821# CONFIG_R8169_NAPI is not set
822# CONFIG_SIS190 is not set 994# CONFIG_SIS190 is not set
823# CONFIG_SKGE is not set 995# CONFIG_SKGE is not set
824CONFIG_SKY2=y 996CONFIG_SKY2=y
997# CONFIG_SKY2_DEBUG is not set
825# CONFIG_VIA_VELOCITY is not set 998# CONFIG_VIA_VELOCITY is not set
826CONFIG_TIGON3=y 999CONFIG_TIGON3=y
827CONFIG_BNX2=y 1000CONFIG_BNX2=y
828# CONFIG_QLA3XXX is not set 1001# CONFIG_QLA3XXX is not set
829# CONFIG_ATL1 is not set 1002# CONFIG_ATL1 is not set
1003# CONFIG_ATL1E is not set
830CONFIG_NETDEV_10000=y 1004CONFIG_NETDEV_10000=y
831# CONFIG_CHELSIO_T1 is not set 1005# CONFIG_CHELSIO_T1 is not set
832# CONFIG_CHELSIO_T3 is not set 1006# CONFIG_CHELSIO_T3 is not set
1007# CONFIG_IXGBE is not set
833# CONFIG_IXGB is not set 1008# CONFIG_IXGB is not set
834# CONFIG_S2IO is not set 1009# CONFIG_S2IO is not set
835# CONFIG_MYRI10GE is not set 1010# CONFIG_MYRI10GE is not set
836# CONFIG_NETXEN_NIC is not set 1011# CONFIG_NETXEN_NIC is not set
1012# CONFIG_NIU is not set
837# CONFIG_MLX4_CORE is not set 1013# CONFIG_MLX4_CORE is not set
838# CONFIG_TR is not set 1014# CONFIG_TEHUTI is not set
1015# CONFIG_BNX2X is not set
1016# CONFIG_SFC is not set
1017CONFIG_TR=y
1018# CONFIG_IBMOL is not set
1019# CONFIG_IBMLS is not set
1020# CONFIG_3C359 is not set
1021# CONFIG_TMS380TR is not set
839 1022
840# 1023#
841# Wireless LAN 1024# Wireless LAN
842# 1025#
843# CONFIG_WLAN_PRE80211 is not set 1026# CONFIG_WLAN_PRE80211 is not set
844# CONFIG_WLAN_80211 is not set 1027CONFIG_WLAN_80211=y
1028# CONFIG_PCMCIA_RAYCS is not set
1029# CONFIG_IPW2100 is not set
1030# CONFIG_IPW2200 is not set
1031# CONFIG_LIBERTAS is not set
1032# CONFIG_AIRO is not set
1033# CONFIG_HERMES is not set
1034# CONFIG_ATMEL is not set
1035# CONFIG_AIRO_CS is not set
1036# CONFIG_PCMCIA_WL3501 is not set
1037# CONFIG_PRISM54 is not set
1038# CONFIG_USB_ZD1201 is not set
1039# CONFIG_USB_NET_RNDIS_WLAN is not set
1040# CONFIG_RTL8180 is not set
1041# CONFIG_RTL8187 is not set
1042# CONFIG_ADM8211 is not set
1043# CONFIG_MAC80211_HWSIM is not set
1044# CONFIG_P54_COMMON is not set
1045CONFIG_ATH5K=y
1046# CONFIG_ATH5K_DEBUG is not set
1047# CONFIG_ATH9K is not set
1048# CONFIG_IWLCORE is not set
1049# CONFIG_IWLWIFI_LEDS is not set
1050# CONFIG_IWLAGN is not set
1051# CONFIG_IWL3945 is not set
1052# CONFIG_HOSTAP is not set
1053# CONFIG_B43 is not set
1054# CONFIG_B43LEGACY is not set
1055# CONFIG_ZD1211RW is not set
1056# CONFIG_RT2X00 is not set
845 1057
846# 1058#
847# USB Network Adapters 1059# USB Network Adapters
@@ -850,16 +1062,27 @@ CONFIG_NETDEV_10000=y
850# CONFIG_USB_KAWETH is not set 1062# CONFIG_USB_KAWETH is not set
851# CONFIG_USB_PEGASUS is not set 1063# CONFIG_USB_PEGASUS is not set
852# CONFIG_USB_RTL8150 is not set 1064# CONFIG_USB_RTL8150 is not set
853# CONFIG_USB_USBNET_MII is not set
854# CONFIG_USB_USBNET is not set 1065# CONFIG_USB_USBNET is not set
1066CONFIG_NET_PCMCIA=y
1067# CONFIG_PCMCIA_3C589 is not set
1068# CONFIG_PCMCIA_3C574 is not set
1069# CONFIG_PCMCIA_FMVJ18X is not set
1070# CONFIG_PCMCIA_PCNET is not set
1071# CONFIG_PCMCIA_NMCLAN is not set
1072# CONFIG_PCMCIA_SMC91C92 is not set
1073# CONFIG_PCMCIA_XIRC2PS is not set
1074# CONFIG_PCMCIA_AXNET is not set
1075# CONFIG_PCMCIA_IBMTR is not set
855# CONFIG_WAN is not set 1076# CONFIG_WAN is not set
856# CONFIG_FDDI is not set 1077CONFIG_FDDI=y
1078# CONFIG_DEFXX is not set
1079# CONFIG_SKFP is not set
857# CONFIG_HIPPI is not set 1080# CONFIG_HIPPI is not set
858# CONFIG_PPP is not set 1081# CONFIG_PPP is not set
859# CONFIG_SLIP is not set 1082# CONFIG_SLIP is not set
860# CONFIG_NET_FC is not set 1083# CONFIG_NET_FC is not set
861# CONFIG_SHAPER is not set
862CONFIG_NETCONSOLE=y 1084CONFIG_NETCONSOLE=y
1085# CONFIG_NETCONSOLE_DYNAMIC is not set
863CONFIG_NETPOLL=y 1086CONFIG_NETPOLL=y
864# CONFIG_NETPOLL_TRAP is not set 1087# CONFIG_NETPOLL_TRAP is not set
865CONFIG_NET_POLL_CONTROLLER=y 1088CONFIG_NET_POLL_CONTROLLER=y
@@ -870,18 +1093,17 @@ CONFIG_NET_POLL_CONTROLLER=y
870# Input device support 1093# Input device support
871# 1094#
872CONFIG_INPUT=y 1095CONFIG_INPUT=y
873# CONFIG_INPUT_FF_MEMLESS is not set 1096CONFIG_INPUT_FF_MEMLESS=y
874# CONFIG_INPUT_POLLDEV is not set 1097CONFIG_INPUT_POLLDEV=y
875 1098
876# 1099#
877# Userland interfaces 1100# Userland interfaces
878# 1101#
879CONFIG_INPUT_MOUSEDEV=y 1102CONFIG_INPUT_MOUSEDEV=y
880CONFIG_INPUT_MOUSEDEV_PSAUX=y 1103# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
881CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 1104CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
882CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 1105CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
883# CONFIG_INPUT_JOYDEV is not set 1106# CONFIG_INPUT_JOYDEV is not set
884# CONFIG_INPUT_TSDEV is not set
885CONFIG_INPUT_EVDEV=y 1107CONFIG_INPUT_EVDEV=y
886# CONFIG_INPUT_EVBUG is not set 1108# CONFIG_INPUT_EVBUG is not set
887 1109
@@ -905,18 +1127,67 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
905# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1127# CONFIG_MOUSE_PS2_TOUCHKIT is not set
906# CONFIG_MOUSE_SERIAL is not set 1128# CONFIG_MOUSE_SERIAL is not set
907# CONFIG_MOUSE_APPLETOUCH is not set 1129# CONFIG_MOUSE_APPLETOUCH is not set
1130# CONFIG_MOUSE_BCM5974 is not set
908# CONFIG_MOUSE_VSXXXAA is not set 1131# CONFIG_MOUSE_VSXXXAA is not set
909# CONFIG_INPUT_JOYSTICK is not set 1132CONFIG_INPUT_JOYSTICK=y
910# CONFIG_INPUT_TABLET is not set 1133# CONFIG_JOYSTICK_ANALOG is not set
911# CONFIG_INPUT_TOUCHSCREEN is not set 1134# CONFIG_JOYSTICK_A3D is not set
912# CONFIG_INPUT_MISC is not set 1135# CONFIG_JOYSTICK_ADI is not set
1136# CONFIG_JOYSTICK_COBRA is not set
1137# CONFIG_JOYSTICK_GF2K is not set
1138# CONFIG_JOYSTICK_GRIP is not set
1139# CONFIG_JOYSTICK_GRIP_MP is not set
1140# CONFIG_JOYSTICK_GUILLEMOT is not set
1141# CONFIG_JOYSTICK_INTERACT is not set
1142# CONFIG_JOYSTICK_SIDEWINDER is not set
1143# CONFIG_JOYSTICK_TMDC is not set
1144# CONFIG_JOYSTICK_IFORCE is not set
1145# CONFIG_JOYSTICK_WARRIOR is not set
1146# CONFIG_JOYSTICK_MAGELLAN is not set
1147# CONFIG_JOYSTICK_SPACEORB is not set
1148# CONFIG_JOYSTICK_SPACEBALL is not set
1149# CONFIG_JOYSTICK_STINGER is not set
1150# CONFIG_JOYSTICK_TWIDJOY is not set
1151# CONFIG_JOYSTICK_ZHENHUA is not set
1152# CONFIG_JOYSTICK_JOYDUMP is not set
1153# CONFIG_JOYSTICK_XPAD is not set
1154CONFIG_INPUT_TABLET=y
1155# CONFIG_TABLET_USB_ACECAD is not set
1156# CONFIG_TABLET_USB_AIPTEK is not set
1157# CONFIG_TABLET_USB_GTCO is not set
1158# CONFIG_TABLET_USB_KBTAB is not set
1159# CONFIG_TABLET_USB_WACOM is not set
1160CONFIG_INPUT_TOUCHSCREEN=y
1161# CONFIG_TOUCHSCREEN_FUJITSU is not set
1162# CONFIG_TOUCHSCREEN_GUNZE is not set
1163# CONFIG_TOUCHSCREEN_ELO is not set
1164# CONFIG_TOUCHSCREEN_MTOUCH is not set
1165# CONFIG_TOUCHSCREEN_INEXIO is not set
1166# CONFIG_TOUCHSCREEN_MK712 is not set
1167# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1168# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1169# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1170# CONFIG_TOUCHSCREEN_UCB1400 is not set
1171# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1172# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1173CONFIG_INPUT_MISC=y
1174# CONFIG_INPUT_PCSPKR is not set
1175# CONFIG_INPUT_APANEL is not set
1176# CONFIG_INPUT_WISTRON_BTNS is not set
1177# CONFIG_INPUT_ATLAS_BTNS is not set
1178# CONFIG_INPUT_ATI_REMOTE is not set
1179# CONFIG_INPUT_ATI_REMOTE2 is not set
1180# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1181# CONFIG_INPUT_POWERMATE is not set
1182# CONFIG_INPUT_YEALINK is not set
1183# CONFIG_INPUT_UINPUT is not set
913 1184
914# 1185#
915# Hardware I/O ports 1186# Hardware I/O ports
916# 1187#
917CONFIG_SERIO=y 1188CONFIG_SERIO=y
918CONFIG_SERIO_I8042=y 1189CONFIG_SERIO_I8042=y
919# CONFIG_SERIO_SERPORT is not set 1190CONFIG_SERIO_SERPORT=y
920# CONFIG_SERIO_CT82C710 is not set 1191# CONFIG_SERIO_CT82C710 is not set
921# CONFIG_SERIO_PCIPS2 is not set 1192# CONFIG_SERIO_PCIPS2 is not set
922CONFIG_SERIO_LIBPS2=y 1193CONFIG_SERIO_LIBPS2=y
@@ -927,10 +1198,29 @@ CONFIG_SERIO_LIBPS2=y
927# Character devices 1198# Character devices
928# 1199#
929CONFIG_VT=y 1200CONFIG_VT=y
1201CONFIG_CONSOLE_TRANSLATIONS=y
930CONFIG_VT_CONSOLE=y 1202CONFIG_VT_CONSOLE=y
931CONFIG_HW_CONSOLE=y 1203CONFIG_HW_CONSOLE=y
932# CONFIG_VT_HW_CONSOLE_BINDING is not set 1204CONFIG_VT_HW_CONSOLE_BINDING=y
933# CONFIG_SERIAL_NONSTANDARD is not set 1205CONFIG_DEVKMEM=y
1206CONFIG_SERIAL_NONSTANDARD=y
1207# CONFIG_COMPUTONE is not set
1208# CONFIG_ROCKETPORT is not set
1209# CONFIG_CYCLADES is not set
1210# CONFIG_DIGIEPCA is not set
1211# CONFIG_MOXA_INTELLIO is not set
1212# CONFIG_MOXA_SMARTIO is not set
1213# CONFIG_ISI is not set
1214# CONFIG_SYNCLINK is not set
1215# CONFIG_SYNCLINKMP is not set
1216# CONFIG_SYNCLINK_GT is not set
1217# CONFIG_N_HDLC is not set
1218# CONFIG_RISCOM8 is not set
1219# CONFIG_SPECIALIX is not set
1220# CONFIG_SX is not set
1221# CONFIG_RIO is not set
1222# CONFIG_STALDRV is not set
1223# CONFIG_NOZOMI is not set
934 1224
935# 1225#
936# Serial drivers 1226# Serial drivers
@@ -940,9 +1230,14 @@ CONFIG_SERIAL_8250_CONSOLE=y
940CONFIG_FIX_EARLYCON_MEM=y 1230CONFIG_FIX_EARLYCON_MEM=y
941CONFIG_SERIAL_8250_PCI=y 1231CONFIG_SERIAL_8250_PCI=y
942CONFIG_SERIAL_8250_PNP=y 1232CONFIG_SERIAL_8250_PNP=y
943CONFIG_SERIAL_8250_NR_UARTS=4 1233# CONFIG_SERIAL_8250_CS is not set
1234CONFIG_SERIAL_8250_NR_UARTS=32
944CONFIG_SERIAL_8250_RUNTIME_UARTS=4 1235CONFIG_SERIAL_8250_RUNTIME_UARTS=4
945# CONFIG_SERIAL_8250_EXTENDED is not set 1236CONFIG_SERIAL_8250_EXTENDED=y
1237CONFIG_SERIAL_8250_MANY_PORTS=y
1238CONFIG_SERIAL_8250_SHARE_IRQ=y
1239CONFIG_SERIAL_8250_DETECT_IRQ=y
1240CONFIG_SERIAL_8250_RSA=y
946 1241
947# 1242#
948# Non-8250 serial port support 1243# Non-8250 serial port support
@@ -951,125 +1246,434 @@ CONFIG_SERIAL_CORE=y
951CONFIG_SERIAL_CORE_CONSOLE=y 1246CONFIG_SERIAL_CORE_CONSOLE=y
952# CONFIG_SERIAL_JSM is not set 1247# CONFIG_SERIAL_JSM is not set
953CONFIG_UNIX98_PTYS=y 1248CONFIG_UNIX98_PTYS=y
954CONFIG_LEGACY_PTYS=y 1249# CONFIG_LEGACY_PTYS is not set
955CONFIG_LEGACY_PTY_COUNT=256
956# CONFIG_IPMI_HANDLER is not set 1250# CONFIG_IPMI_HANDLER is not set
957# CONFIG_WATCHDOG is not set
958CONFIG_HW_RANDOM=y 1251CONFIG_HW_RANDOM=y
959CONFIG_HW_RANDOM_INTEL=y 1252CONFIG_HW_RANDOM_INTEL=y
960CONFIG_HW_RANDOM_AMD=y 1253CONFIG_HW_RANDOM_AMD=y
961CONFIG_HW_RANDOM_GEODE=y 1254CONFIG_HW_RANDOM_GEODE=y
962CONFIG_HW_RANDOM_VIA=y 1255CONFIG_HW_RANDOM_VIA=y
963# CONFIG_NVRAM is not set 1256CONFIG_NVRAM=y
964CONFIG_RTC=y
965# CONFIG_R3964 is not set 1257# CONFIG_R3964 is not set
966# CONFIG_APPLICOM is not set 1258# CONFIG_APPLICOM is not set
967# CONFIG_SONYPI is not set 1259# CONFIG_SONYPI is not set
968CONFIG_AGP=y 1260
969# CONFIG_AGP_ALI is not set 1261#
970# CONFIG_AGP_ATI is not set 1262# PCMCIA character devices
971# CONFIG_AGP_AMD is not set 1263#
972CONFIG_AGP_AMD64=y 1264# CONFIG_SYNCLINK_CS is not set
973CONFIG_AGP_INTEL=y 1265# CONFIG_CARDMAN_4000 is not set
974# CONFIG_AGP_NVIDIA is not set 1266# CONFIG_CARDMAN_4040 is not set
975# CONFIG_AGP_SIS is not set 1267# CONFIG_IPWIRELESS is not set
976# CONFIG_AGP_SWORKS is not set
977# CONFIG_AGP_VIA is not set
978# CONFIG_AGP_EFFICEON is not set
979# CONFIG_DRM is not set
980# CONFIG_MWAVE is not set 1268# CONFIG_MWAVE is not set
981# CONFIG_PC8736x_GPIO is not set 1269# CONFIG_PC8736x_GPIO is not set
982# CONFIG_NSC_GPIO is not set 1270# CONFIG_NSC_GPIO is not set
983# CONFIG_CS5535_GPIO is not set 1271# CONFIG_CS5535_GPIO is not set
984CONFIG_RAW_DRIVER=y 1272# CONFIG_RAW_DRIVER is not set
985CONFIG_MAX_RAW_DEVS=256
986CONFIG_HPET=y 1273CONFIG_HPET=y
987# CONFIG_HPET_RTC_IRQ is not set 1274# CONFIG_HPET_MMAP is not set
988CONFIG_HPET_MMAP=y
989# CONFIG_HANGCHECK_TIMER is not set 1275# CONFIG_HANGCHECK_TIMER is not set
990# CONFIG_TCG_TPM is not set 1276# CONFIG_TCG_TPM is not set
991# CONFIG_TELCLOCK is not set 1277# CONFIG_TELCLOCK is not set
992CONFIG_DEVPORT=y 1278CONFIG_DEVPORT=y
993# CONFIG_I2C is not set 1279CONFIG_I2C=y
1280CONFIG_I2C_BOARDINFO=y
1281# CONFIG_I2C_CHARDEV is not set
1282CONFIG_I2C_HELPER_AUTO=y
1283
1284#
1285# I2C Hardware Bus support
1286#
1287
1288#
1289# PC SMBus host controller drivers
1290#
1291# CONFIG_I2C_ALI1535 is not set
1292# CONFIG_I2C_ALI1563 is not set
1293# CONFIG_I2C_ALI15X3 is not set
1294# CONFIG_I2C_AMD756 is not set
1295# CONFIG_I2C_AMD8111 is not set
1296CONFIG_I2C_I801=y
1297# CONFIG_I2C_ISCH is not set
1298# CONFIG_I2C_PIIX4 is not set
1299# CONFIG_I2C_NFORCE2 is not set
1300# CONFIG_I2C_SIS5595 is not set
1301# CONFIG_I2C_SIS630 is not set
1302# CONFIG_I2C_SIS96X is not set
1303# CONFIG_I2C_VIA is not set
1304# CONFIG_I2C_VIAPRO is not set
1305
1306#
1307# I2C system bus drivers (mostly embedded / system-on-chip)
1308#
1309# CONFIG_I2C_OCORES is not set
1310# CONFIG_I2C_SIMTEC is not set
994 1311
995# 1312#
996# SPI support 1313# External I2C/SMBus adapter drivers
997# 1314#
1315# CONFIG_I2C_PARPORT_LIGHT is not set
1316# CONFIG_I2C_TAOS_EVM is not set
1317# CONFIG_I2C_TINY_USB is not set
1318
1319#
1320# Graphics adapter I2C/DDC channel drivers
1321#
1322# CONFIG_I2C_VOODOO3 is not set
1323
1324#
1325# Other I2C/SMBus bus drivers
1326#
1327# CONFIG_I2C_PCA_PLATFORM is not set
1328# CONFIG_I2C_STUB is not set
1329# CONFIG_SCx200_ACB is not set
1330
1331#
1332# Miscellaneous I2C Chip support
1333#
1334# CONFIG_DS1682 is not set
1335# CONFIG_AT24 is not set
1336# CONFIG_SENSORS_EEPROM is not set
1337# CONFIG_SENSORS_PCF8574 is not set
1338# CONFIG_PCF8575 is not set
1339# CONFIG_SENSORS_PCA9539 is not set
1340# CONFIG_SENSORS_PCF8591 is not set
1341# CONFIG_SENSORS_MAX6875 is not set
1342# CONFIG_SENSORS_TSL2550 is not set
1343# CONFIG_I2C_DEBUG_CORE is not set
1344# CONFIG_I2C_DEBUG_ALGO is not set
1345# CONFIG_I2C_DEBUG_BUS is not set
1346# CONFIG_I2C_DEBUG_CHIP is not set
998# CONFIG_SPI is not set 1347# CONFIG_SPI is not set
999# CONFIG_SPI_MASTER is not set 1348CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1349# CONFIG_GPIOLIB is not set
1000# CONFIG_W1 is not set 1350# CONFIG_W1 is not set
1001# CONFIG_POWER_SUPPLY is not set 1351CONFIG_POWER_SUPPLY=y
1352# CONFIG_POWER_SUPPLY_DEBUG is not set
1353# CONFIG_PDA_POWER is not set
1354# CONFIG_BATTERY_DS2760 is not set
1002# CONFIG_HWMON is not set 1355# CONFIG_HWMON is not set
1356CONFIG_THERMAL=y
1357CONFIG_WATCHDOG=y
1358# CONFIG_WATCHDOG_NOWAYOUT is not set
1359
1360#
1361# Watchdog Device Drivers
1362#
1363# CONFIG_SOFT_WATCHDOG is not set
1364# CONFIG_ACQUIRE_WDT is not set
1365# CONFIG_ADVANTECH_WDT is not set
1366# CONFIG_ALIM1535_WDT is not set
1367# CONFIG_ALIM7101_WDT is not set
1368# CONFIG_SC520_WDT is not set
1369# CONFIG_EUROTECH_WDT is not set
1370# CONFIG_IB700_WDT is not set
1371# CONFIG_IBMASR is not set
1372# CONFIG_WAFER_WDT is not set
1373# CONFIG_I6300ESB_WDT is not set
1374# CONFIG_ITCO_WDT is not set
1375# CONFIG_IT8712F_WDT is not set
1376# CONFIG_HP_WATCHDOG is not set
1377# CONFIG_SC1200_WDT is not set
1378# CONFIG_PC87413_WDT is not set
1379# CONFIG_60XX_WDT is not set
1380# CONFIG_SBC8360_WDT is not set
1381# CONFIG_SBC7240_WDT is not set
1382# CONFIG_CPU5_WDT is not set
1383# CONFIG_SMSC37B787_WDT is not set
1384# CONFIG_W83627HF_WDT is not set
1385# CONFIG_W83697HF_WDT is not set
1386# CONFIG_W83877F_WDT is not set
1387# CONFIG_W83977F_WDT is not set
1388# CONFIG_MACHZ_WDT is not set
1389# CONFIG_SBC_EPX_C3_WATCHDOG is not set
1390
1391#
1392# PCI-based Watchdog Cards
1393#
1394# CONFIG_PCIPCWATCHDOG is not set
1395# CONFIG_WDTPCI is not set
1396
1397#
1398# USB-based Watchdog Cards
1399#
1400# CONFIG_USBPCWATCHDOG is not set
1401
1402#
1403# Sonics Silicon Backplane
1404#
1405CONFIG_SSB_POSSIBLE=y
1406# CONFIG_SSB is not set
1003 1407
1004# 1408#
1005# Multifunction device drivers 1409# Multifunction device drivers
1006# 1410#
1411# CONFIG_MFD_CORE is not set
1007# CONFIG_MFD_SM501 is not set 1412# CONFIG_MFD_SM501 is not set
1413# CONFIG_HTC_PASIC3 is not set
1414# CONFIG_MFD_TMIO is not set
1008 1415
1009# 1416#
1010# Multimedia devices 1417# Multimedia devices
1011# 1418#
1419
1420#
1421# Multimedia core support
1422#
1012# CONFIG_VIDEO_DEV is not set 1423# CONFIG_VIDEO_DEV is not set
1013# CONFIG_DVB_CORE is not set 1424# CONFIG_DVB_CORE is not set
1425# CONFIG_VIDEO_MEDIA is not set
1426
1427#
1428# Multimedia drivers
1429#
1014CONFIG_DAB=y 1430CONFIG_DAB=y
1015# CONFIG_USB_DABUSB is not set 1431# CONFIG_USB_DABUSB is not set
1016 1432
1017# 1433#
1018# Graphics support 1434# Graphics support
1019# 1435#
1020# CONFIG_BACKLIGHT_LCD_SUPPORT is not set 1436CONFIG_AGP=y
1437# CONFIG_AGP_ALI is not set
1438# CONFIG_AGP_ATI is not set
1439# CONFIG_AGP_AMD is not set
1440CONFIG_AGP_AMD64=y
1441CONFIG_AGP_INTEL=y
1442# CONFIG_AGP_NVIDIA is not set
1443# CONFIG_AGP_SIS is not set
1444# CONFIG_AGP_SWORKS is not set
1445# CONFIG_AGP_VIA is not set
1446# CONFIG_AGP_EFFICEON is not set
1447CONFIG_DRM=y
1448# CONFIG_DRM_TDFX is not set
1449# CONFIG_DRM_R128 is not set
1450# CONFIG_DRM_RADEON is not set
1451# CONFIG_DRM_I810 is not set
1452# CONFIG_DRM_I830 is not set
1453CONFIG_DRM_I915=y
1454# CONFIG_DRM_MGA is not set
1455# CONFIG_DRM_SIS is not set
1456# CONFIG_DRM_VIA is not set
1457# CONFIG_DRM_SAVAGE is not set
1458# CONFIG_VGASTATE is not set
1459# CONFIG_VIDEO_OUTPUT_CONTROL is not set
1460CONFIG_FB=y
1461# CONFIG_FIRMWARE_EDID is not set
1462# CONFIG_FB_DDC is not set
1463CONFIG_FB_CFB_FILLRECT=y
1464CONFIG_FB_CFB_COPYAREA=y
1465CONFIG_FB_CFB_IMAGEBLIT=y
1466# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
1467# CONFIG_FB_SYS_FILLRECT is not set
1468# CONFIG_FB_SYS_COPYAREA is not set
1469# CONFIG_FB_SYS_IMAGEBLIT is not set
1470# CONFIG_FB_FOREIGN_ENDIAN is not set
1471# CONFIG_FB_SYS_FOPS is not set
1472# CONFIG_FB_SVGALIB is not set
1473# CONFIG_FB_MACMODES is not set
1474# CONFIG_FB_BACKLIGHT is not set
1475CONFIG_FB_MODE_HELPERS=y
1476CONFIG_FB_TILEBLITTING=y
1477
1478#
1479# Frame buffer hardware drivers
1480#
1481# CONFIG_FB_CIRRUS is not set
1482# CONFIG_FB_PM2 is not set
1483# CONFIG_FB_CYBER2000 is not set
1484# CONFIG_FB_ARC is not set
1485# CONFIG_FB_ASILIANT is not set
1486# CONFIG_FB_IMSTT is not set
1487# CONFIG_FB_VGA16 is not set
1488# CONFIG_FB_UVESA is not set
1489# CONFIG_FB_VESA is not set
1490CONFIG_FB_EFI=y
1491# CONFIG_FB_IMAC is not set
1492# CONFIG_FB_N411 is not set
1493# CONFIG_FB_HGA is not set
1494# CONFIG_FB_S1D13XXX is not set
1495# CONFIG_FB_NVIDIA is not set
1496# CONFIG_FB_RIVA is not set
1497# CONFIG_FB_I810 is not set
1498# CONFIG_FB_LE80578 is not set
1499# CONFIG_FB_INTEL is not set
1500# CONFIG_FB_MATROX is not set
1501# CONFIG_FB_RADEON is not set
1502# CONFIG_FB_ATY128 is not set
1503# CONFIG_FB_ATY is not set
1504# CONFIG_FB_S3 is not set
1505# CONFIG_FB_SAVAGE is not set
1506# CONFIG_FB_SIS is not set
1507# CONFIG_FB_NEOMAGIC is not set
1508# CONFIG_FB_KYRO is not set
1509# CONFIG_FB_3DFX is not set
1510# CONFIG_FB_VOODOO1 is not set
1511# CONFIG_FB_VT8623 is not set
1512# CONFIG_FB_CYBLA is not set
1513# CONFIG_FB_TRIDENT is not set
1514# CONFIG_FB_ARK is not set
1515# CONFIG_FB_PM3 is not set
1516# CONFIG_FB_CARMINE is not set
1517# CONFIG_FB_GEODE is not set
1518# CONFIG_FB_VIRTUAL is not set
1519CONFIG_BACKLIGHT_LCD_SUPPORT=y
1520# CONFIG_LCD_CLASS_DEVICE is not set
1521CONFIG_BACKLIGHT_CLASS_DEVICE=y
1522# CONFIG_BACKLIGHT_CORGI is not set
1523# CONFIG_BACKLIGHT_PROGEAR is not set
1524# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1021 1525
1022# 1526#
1023# Display device support 1527# Display device support
1024# 1528#
1025# CONFIG_DISPLAY_SUPPORT is not set 1529# CONFIG_DISPLAY_SUPPORT is not set
1026# CONFIG_VGASTATE is not set
1027# CONFIG_FB is not set
1028 1530
1029# 1531#
1030# Console display driver support 1532# Console display driver support
1031# 1533#
1032CONFIG_VGA_CONSOLE=y 1534CONFIG_VGA_CONSOLE=y
1033CONFIG_VGACON_SOFT_SCROLLBACK=y 1535CONFIG_VGACON_SOFT_SCROLLBACK=y
1034CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 1536CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1035CONFIG_VIDEO_SELECT=y
1036CONFIG_DUMMY_CONSOLE=y 1537CONFIG_DUMMY_CONSOLE=y
1037 1538# CONFIG_FRAMEBUFFER_CONSOLE is not set
1038# 1539CONFIG_LOGO=y
1039# Sound 1540# CONFIG_LOGO_LINUX_MONO is not set
1040# 1541# CONFIG_LOGO_LINUX_VGA16 is not set
1542CONFIG_LOGO_LINUX_CLUT224=y
1041CONFIG_SOUND=y 1543CONFIG_SOUND=y
1042 1544CONFIG_SND=y
1043# 1545CONFIG_SND_TIMER=y
1044# Advanced Linux Sound Architecture 1546CONFIG_SND_PCM=y
1045# 1547CONFIG_SND_HWDEP=y
1046# CONFIG_SND is not set 1548CONFIG_SND_SEQUENCER=y
1047 1549CONFIG_SND_SEQ_DUMMY=y
1048# 1550CONFIG_SND_OSSEMUL=y
1049# Open Sound System 1551CONFIG_SND_MIXER_OSS=y
1050# 1552CONFIG_SND_PCM_OSS=y
1051CONFIG_SOUND_PRIME=y 1553CONFIG_SND_PCM_OSS_PLUGINS=y
1052# CONFIG_SOUND_TRIDENT is not set 1554CONFIG_SND_SEQUENCER_OSS=y
1053# CONFIG_SOUND_MSNDCLAS is not set 1555CONFIG_SND_DYNAMIC_MINORS=y
1054# CONFIG_SOUND_MSNDPIN is not set 1556CONFIG_SND_SUPPORT_OLD_API=y
1055# CONFIG_SOUND_OSS is not set 1557CONFIG_SND_VERBOSE_PROCFS=y
1558# CONFIG_SND_VERBOSE_PRINTK is not set
1559# CONFIG_SND_DEBUG is not set
1560CONFIG_SND_VMASTER=y
1561CONFIG_SND_DRIVERS=y
1562# CONFIG_SND_PCSP is not set
1563# CONFIG_SND_DUMMY is not set
1564# CONFIG_SND_VIRMIDI is not set
1565# CONFIG_SND_MTPAV is not set
1566# CONFIG_SND_SERIAL_U16550 is not set
1567# CONFIG_SND_MPU401 is not set
1568CONFIG_SND_PCI=y
1569# CONFIG_SND_AD1889 is not set
1570# CONFIG_SND_ALS300 is not set
1571# CONFIG_SND_ALS4000 is not set
1572# CONFIG_SND_ALI5451 is not set
1573# CONFIG_SND_ATIIXP is not set
1574# CONFIG_SND_ATIIXP_MODEM is not set
1575# CONFIG_SND_AU8810 is not set
1576# CONFIG_SND_AU8820 is not set
1577# CONFIG_SND_AU8830 is not set
1578# CONFIG_SND_AW2 is not set
1579# CONFIG_SND_AZT3328 is not set
1580# CONFIG_SND_BT87X is not set
1581# CONFIG_SND_CA0106 is not set
1582# CONFIG_SND_CMIPCI is not set
1583# CONFIG_SND_OXYGEN is not set
1584# CONFIG_SND_CS4281 is not set
1585# CONFIG_SND_CS46XX is not set
1586# CONFIG_SND_CS5530 is not set
1587# CONFIG_SND_CS5535AUDIO is not set
1588# CONFIG_SND_DARLA20 is not set
1589# CONFIG_SND_GINA20 is not set
1590# CONFIG_SND_LAYLA20 is not set
1591# CONFIG_SND_DARLA24 is not set
1592# CONFIG_SND_GINA24 is not set
1593# CONFIG_SND_LAYLA24 is not set
1594# CONFIG_SND_MONA is not set
1595# CONFIG_SND_MIA is not set
1596# CONFIG_SND_ECHO3G is not set
1597# CONFIG_SND_INDIGO is not set
1598# CONFIG_SND_INDIGOIO is not set
1599# CONFIG_SND_INDIGODJ is not set
1600# CONFIG_SND_EMU10K1 is not set
1601# CONFIG_SND_EMU10K1X is not set
1602# CONFIG_SND_ENS1370 is not set
1603# CONFIG_SND_ENS1371 is not set
1604# CONFIG_SND_ES1938 is not set
1605# CONFIG_SND_ES1968 is not set
1606# CONFIG_SND_FM801 is not set
1607CONFIG_SND_HDA_INTEL=y
1608CONFIG_SND_HDA_HWDEP=y
1609CONFIG_SND_HDA_CODEC_REALTEK=y
1610CONFIG_SND_HDA_CODEC_ANALOG=y
1611CONFIG_SND_HDA_CODEC_SIGMATEL=y
1612CONFIG_SND_HDA_CODEC_VIA=y
1613CONFIG_SND_HDA_CODEC_ATIHDMI=y
1614CONFIG_SND_HDA_CODEC_CONEXANT=y
1615CONFIG_SND_HDA_CODEC_CMEDIA=y
1616CONFIG_SND_HDA_CODEC_SI3054=y
1617CONFIG_SND_HDA_GENERIC=y
1618# CONFIG_SND_HDA_POWER_SAVE is not set
1619# CONFIG_SND_HDSP is not set
1620# CONFIG_SND_HDSPM is not set
1621# CONFIG_SND_HIFIER is not set
1622# CONFIG_SND_ICE1712 is not set
1623# CONFIG_SND_ICE1724 is not set
1624# CONFIG_SND_INTEL8X0 is not set
1625# CONFIG_SND_INTEL8X0M is not set
1626# CONFIG_SND_KORG1212 is not set
1627# CONFIG_SND_MAESTRO3 is not set
1628# CONFIG_SND_MIXART is not set
1629# CONFIG_SND_NM256 is not set
1630# CONFIG_SND_PCXHR is not set
1631# CONFIG_SND_RIPTIDE is not set
1632# CONFIG_SND_RME32 is not set
1633# CONFIG_SND_RME96 is not set
1634# CONFIG_SND_RME9652 is not set
1635# CONFIG_SND_SIS7019 is not set
1636# CONFIG_SND_SONICVIBES is not set
1637# CONFIG_SND_TRIDENT is not set
1638# CONFIG_SND_VIA82XX is not set
1639# CONFIG_SND_VIA82XX_MODEM is not set
1640# CONFIG_SND_VIRTUOSO is not set
1641# CONFIG_SND_VX222 is not set
1642# CONFIG_SND_YMFPCI is not set
1643CONFIG_SND_USB=y
1644# CONFIG_SND_USB_AUDIO is not set
1645# CONFIG_SND_USB_USX2Y is not set
1646# CONFIG_SND_USB_CAIAQ is not set
1647CONFIG_SND_PCMCIA=y
1648# CONFIG_SND_VXPOCKET is not set
1649# CONFIG_SND_PDAUDIOCF is not set
1650# CONFIG_SND_SOC is not set
1651# CONFIG_SOUND_PRIME is not set
1056CONFIG_HID_SUPPORT=y 1652CONFIG_HID_SUPPORT=y
1057CONFIG_HID=y 1653CONFIG_HID=y
1058# CONFIG_HID_DEBUG is not set 1654CONFIG_HID_DEBUG=y
1655CONFIG_HIDRAW=y
1059 1656
1060# 1657#
1061# USB Input Devices 1658# USB Input Devices
1062# 1659#
1063CONFIG_USB_HID=y 1660CONFIG_USB_HID=y
1064# CONFIG_USB_HIDINPUT_POWERBOOK is not set 1661CONFIG_USB_HIDINPUT_POWERBOOK=y
1065# CONFIG_HID_FF is not set 1662CONFIG_HID_FF=y
1066# CONFIG_USB_HIDDEV is not set 1663CONFIG_HID_PID=y
1664CONFIG_LOGITECH_FF=y
1665# CONFIG_LOGIRUMBLEPAD2_FF is not set
1666CONFIG_PANTHERLORD_FF=y
1667CONFIG_THRUSTMASTER_FF=y
1668CONFIG_ZEROPLUS_FF=y
1669CONFIG_USB_HIDDEV=y
1067CONFIG_USB_SUPPORT=y 1670CONFIG_USB_SUPPORT=y
1068CONFIG_USB_ARCH_HAS_HCD=y 1671CONFIG_USB_ARCH_HAS_HCD=y
1069CONFIG_USB_ARCH_HAS_OHCI=y 1672CONFIG_USB_ARCH_HAS_OHCI=y
1070CONFIG_USB_ARCH_HAS_EHCI=y 1673CONFIG_USB_ARCH_HAS_EHCI=y
1071CONFIG_USB=y 1674CONFIG_USB=y
1072# CONFIG_USB_DEBUG is not set 1675CONFIG_USB_DEBUG=y
1676CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
1073 1677
1074# 1678#
1075# Miscellaneous USB options 1679# Miscellaneous USB options
@@ -1077,18 +1681,19 @@ CONFIG_USB=y
1077CONFIG_USB_DEVICEFS=y 1681CONFIG_USB_DEVICEFS=y
1078# CONFIG_USB_DEVICE_CLASS is not set 1682# CONFIG_USB_DEVICE_CLASS is not set
1079# CONFIG_USB_DYNAMIC_MINORS is not set 1683# CONFIG_USB_DYNAMIC_MINORS is not set
1080# CONFIG_USB_SUSPEND is not set 1684CONFIG_USB_SUSPEND=y
1081# CONFIG_USB_PERSIST is not set
1082# CONFIG_USB_OTG is not set 1685# CONFIG_USB_OTG is not set
1686CONFIG_USB_MON=y
1083 1687
1084# 1688#
1085# USB Host Controller Drivers 1689# USB Host Controller Drivers
1086# 1690#
1691# CONFIG_USB_C67X00_HCD is not set
1087CONFIG_USB_EHCI_HCD=y 1692CONFIG_USB_EHCI_HCD=y
1088# CONFIG_USB_EHCI_SPLIT_ISO is not set
1089# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1693# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1090# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1694# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1091# CONFIG_USB_ISP116X_HCD is not set 1695# CONFIG_USB_ISP116X_HCD is not set
1696# CONFIG_USB_ISP1760_HCD is not set
1092CONFIG_USB_OHCI_HCD=y 1697CONFIG_USB_OHCI_HCD=y
1093# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set 1698# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
1094# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set 1699# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
@@ -1102,6 +1707,7 @@ CONFIG_USB_UHCI_HCD=y
1102# 1707#
1103# CONFIG_USB_ACM is not set 1708# CONFIG_USB_ACM is not set
1104CONFIG_USB_PRINTER=y 1709CONFIG_USB_PRINTER=y
1710# CONFIG_USB_WDM is not set
1105 1711
1106# 1712#
1107# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1713# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1121,23 +1727,21 @@ CONFIG_USB_STORAGE=y
1121# CONFIG_USB_STORAGE_SDDR55 is not set 1727# CONFIG_USB_STORAGE_SDDR55 is not set
1122# CONFIG_USB_STORAGE_JUMPSHOT is not set 1728# CONFIG_USB_STORAGE_JUMPSHOT is not set
1123# CONFIG_USB_STORAGE_ALAUDA is not set 1729# CONFIG_USB_STORAGE_ALAUDA is not set
1730# CONFIG_USB_STORAGE_ONETOUCH is not set
1124# CONFIG_USB_STORAGE_KARMA is not set 1731# CONFIG_USB_STORAGE_KARMA is not set
1125# CONFIG_USB_LIBUSUAL is not set 1732# CONFIG_USB_STORAGE_SIERRA is not set
1733# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1734CONFIG_USB_LIBUSUAL=y
1126 1735
1127# 1736#
1128# USB Imaging devices 1737# USB Imaging devices
1129# 1738#
1130# CONFIG_USB_MDC800 is not set 1739# CONFIG_USB_MDC800 is not set
1131# CONFIG_USB_MICROTEK is not set 1740# CONFIG_USB_MICROTEK is not set
1132CONFIG_USB_MON=y
1133 1741
1134# 1742#
1135# USB port drivers 1743# USB port drivers
1136# 1744#
1137
1138#
1139# USB Serial Converter support
1140#
1141# CONFIG_USB_SERIAL is not set 1745# CONFIG_USB_SERIAL is not set
1142 1746
1143# 1747#
@@ -1146,7 +1750,6 @@ CONFIG_USB_MON=y
1146# CONFIG_USB_EMI62 is not set 1750# CONFIG_USB_EMI62 is not set
1147# CONFIG_USB_EMI26 is not set 1751# CONFIG_USB_EMI26 is not set
1148# CONFIG_USB_ADUTUX is not set 1752# CONFIG_USB_ADUTUX is not set
1149# CONFIG_USB_AUERSWALD is not set
1150# CONFIG_USB_RIO500 is not set 1753# CONFIG_USB_RIO500 is not set
1151# CONFIG_USB_LEGOTOWER is not set 1754# CONFIG_USB_LEGOTOWER is not set
1152# CONFIG_USB_LCD is not set 1755# CONFIG_USB_LCD is not set
@@ -1163,90 +1766,131 @@ CONFIG_USB_MON=y
1163# CONFIG_USB_TRANCEVIBRATOR is not set 1766# CONFIG_USB_TRANCEVIBRATOR is not set
1164# CONFIG_USB_IOWARRIOR is not set 1767# CONFIG_USB_IOWARRIOR is not set
1165# CONFIG_USB_TEST is not set 1768# CONFIG_USB_TEST is not set
1769# CONFIG_USB_ISIGHTFW is not set
1770# CONFIG_USB_GADGET is not set
1771# CONFIG_MMC is not set
1772# CONFIG_MEMSTICK is not set
1773CONFIG_NEW_LEDS=y
1774CONFIG_LEDS_CLASS=y
1166 1775
1167# 1776#
1168# USB DSL modem support 1777# LED drivers
1169# 1778#
1779# CONFIG_LEDS_PCA9532 is not set
1780# CONFIG_LEDS_CLEVO_MAIL is not set
1781# CONFIG_LEDS_PCA955X is not set
1170 1782
1171# 1783#
1172# USB Gadget Support 1784# LED Triggers
1173# 1785#
1174# CONFIG_USB_GADGET is not set 1786CONFIG_LEDS_TRIGGERS=y
1175# CONFIG_MMC is not set 1787# CONFIG_LEDS_TRIGGER_TIMER is not set
1788# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1789# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1790# CONFIG_ACCESSIBILITY is not set
1791# CONFIG_INFINIBAND is not set
1792CONFIG_EDAC=y
1176 1793
1177# 1794#
1178# LED devices 1795# Reporting subsystems
1179# 1796#
1180# CONFIG_NEW_LEDS is not set 1797# CONFIG_EDAC_DEBUG is not set
1798# CONFIG_EDAC_MM_EDAC is not set
1799CONFIG_RTC_LIB=y
1800CONFIG_RTC_CLASS=y
1801# CONFIG_RTC_HCTOSYS is not set
1802# CONFIG_RTC_DEBUG is not set
1181 1803
1182# 1804#
1183# LED drivers 1805# RTC interfaces
1184# 1806#
1807CONFIG_RTC_INTF_SYSFS=y
1808CONFIG_RTC_INTF_PROC=y
1809CONFIG_RTC_INTF_DEV=y
1810# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
1811# CONFIG_RTC_DRV_TEST is not set
1185 1812
1186# 1813#
1187# LED Triggers 1814# I2C RTC drivers
1188# 1815#
1189# CONFIG_INFINIBAND is not set 1816# CONFIG_RTC_DRV_DS1307 is not set
1190# CONFIG_EDAC is not set 1817# CONFIG_RTC_DRV_DS1374 is not set
1818# CONFIG_RTC_DRV_DS1672 is not set
1819# CONFIG_RTC_DRV_MAX6900 is not set
1820# CONFIG_RTC_DRV_RS5C372 is not set
1821# CONFIG_RTC_DRV_ISL1208 is not set
1822# CONFIG_RTC_DRV_X1205 is not set
1823# CONFIG_RTC_DRV_PCF8563 is not set
1824# CONFIG_RTC_DRV_PCF8583 is not set
1825# CONFIG_RTC_DRV_M41T80 is not set
1826# CONFIG_RTC_DRV_S35390A is not set
1827# CONFIG_RTC_DRV_FM3130 is not set
1191 1828
1192# 1829#
1193# Real Time Clock 1830# SPI RTC drivers
1194# 1831#
1195# CONFIG_RTC_CLASS is not set
1196 1832
1197# 1833#
1198# DMA Engine support 1834# Platform RTC drivers
1199# 1835#
1200# CONFIG_DMA_ENGINE is not set 1836CONFIG_RTC_DRV_CMOS=y
1837# CONFIG_RTC_DRV_DS1511 is not set
1838# CONFIG_RTC_DRV_DS1553 is not set
1839# CONFIG_RTC_DRV_DS1742 is not set
1840# CONFIG_RTC_DRV_STK17TA8 is not set
1841# CONFIG_RTC_DRV_M48T86 is not set
1842# CONFIG_RTC_DRV_M48T59 is not set
1843# CONFIG_RTC_DRV_V3020 is not set
1201 1844
1202# 1845#
1203# DMA Clients 1846# on-CPU RTC drivers
1204# 1847#
1848CONFIG_DMADEVICES=y
1205 1849
1206# 1850#
1207# DMA Devices 1851# DMA Devices
1208# 1852#
1209CONFIG_VIRTUALIZATION=y 1853# CONFIG_INTEL_IOATDMA is not set
1210# CONFIG_KVM is not set 1854# CONFIG_UIO is not set
1211 1855
1212# 1856#
1213# Userspace I/O 1857# Firmware Drivers
1214# 1858#
1215# CONFIG_UIO is not set 1859# CONFIG_EDD is not set
1860CONFIG_FIRMWARE_MEMMAP=y
1861CONFIG_EFI_VARS=y
1862# CONFIG_DELL_RBU is not set
1863# CONFIG_DCDBAS is not set
1864CONFIG_DMIID=y
1865CONFIG_ISCSI_IBFT_FIND=y
1866CONFIG_ISCSI_IBFT=y
1216 1867
1217# 1868#
1218# File systems 1869# File systems
1219# 1870#
1220CONFIG_EXT2_FS=y 1871# CONFIG_EXT2_FS is not set
1221CONFIG_EXT2_FS_XATTR=y
1222CONFIG_EXT2_FS_POSIX_ACL=y
1223# CONFIG_EXT2_FS_SECURITY is not set
1224# CONFIG_EXT2_FS_XIP is not set
1225CONFIG_EXT3_FS=y 1872CONFIG_EXT3_FS=y
1226CONFIG_EXT3_FS_XATTR=y 1873CONFIG_EXT3_FS_XATTR=y
1227CONFIG_EXT3_FS_POSIX_ACL=y 1874CONFIG_EXT3_FS_POSIX_ACL=y
1228# CONFIG_EXT3_FS_SECURITY is not set 1875CONFIG_EXT3_FS_SECURITY=y
1229# CONFIG_EXT4DEV_FS is not set 1876# CONFIG_EXT4DEV_FS is not set
1230CONFIG_JBD=y 1877CONFIG_JBD=y
1231# CONFIG_JBD_DEBUG is not set 1878# CONFIG_JBD_DEBUG is not set
1232CONFIG_FS_MBCACHE=y 1879CONFIG_FS_MBCACHE=y
1233CONFIG_REISERFS_FS=y 1880# CONFIG_REISERFS_FS is not set
1234# CONFIG_REISERFS_CHECK is not set
1235# CONFIG_REISERFS_PROC_INFO is not set
1236CONFIG_REISERFS_FS_XATTR=y
1237CONFIG_REISERFS_FS_POSIX_ACL=y
1238# CONFIG_REISERFS_FS_SECURITY is not set
1239# CONFIG_JFS_FS is not set 1881# CONFIG_JFS_FS is not set
1240CONFIG_FS_POSIX_ACL=y 1882CONFIG_FS_POSIX_ACL=y
1241# CONFIG_XFS_FS is not set 1883# CONFIG_XFS_FS is not set
1242# CONFIG_GFS2_FS is not set
1243# CONFIG_OCFS2_FS is not set 1884# CONFIG_OCFS2_FS is not set
1244# CONFIG_MINIX_FS is not set 1885CONFIG_DNOTIFY=y
1245# CONFIG_ROMFS_FS is not set
1246CONFIG_INOTIFY=y 1886CONFIG_INOTIFY=y
1247CONFIG_INOTIFY_USER=y 1887CONFIG_INOTIFY_USER=y
1248# CONFIG_QUOTA is not set 1888CONFIG_QUOTA=y
1249CONFIG_DNOTIFY=y 1889CONFIG_QUOTA_NETLINK_INTERFACE=y
1890# CONFIG_PRINT_QUOTA_WARNING is not set
1891# CONFIG_QFMT_V1 is not set
1892CONFIG_QFMT_V2=y
1893CONFIG_QUOTACTL=y
1250# CONFIG_AUTOFS_FS is not set 1894# CONFIG_AUTOFS_FS is not set
1251CONFIG_AUTOFS4_FS=y 1895CONFIG_AUTOFS4_FS=y
1252# CONFIG_FUSE_FS is not set 1896# CONFIG_FUSE_FS is not set
@@ -1256,8 +1900,8 @@ CONFIG_GENERIC_ACL=y
1256# CD-ROM/DVD Filesystems 1900# CD-ROM/DVD Filesystems
1257# 1901#
1258CONFIG_ISO9660_FS=y 1902CONFIG_ISO9660_FS=y
1259# CONFIG_JOLIET is not set 1903CONFIG_JOLIET=y
1260# CONFIG_ZISOFS is not set 1904CONFIG_ZISOFS=y
1261# CONFIG_UDF_FS is not set 1905# CONFIG_UDF_FS is not set
1262 1906
1263# 1907#
@@ -1275,13 +1919,13 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
1275# 1919#
1276CONFIG_PROC_FS=y 1920CONFIG_PROC_FS=y
1277CONFIG_PROC_KCORE=y 1921CONFIG_PROC_KCORE=y
1922CONFIG_PROC_VMCORE=y
1278CONFIG_PROC_SYSCTL=y 1923CONFIG_PROC_SYSCTL=y
1279CONFIG_SYSFS=y 1924CONFIG_SYSFS=y
1280CONFIG_TMPFS=y 1925CONFIG_TMPFS=y
1281CONFIG_TMPFS_POSIX_ACL=y 1926CONFIG_TMPFS_POSIX_ACL=y
1282CONFIG_HUGETLBFS=y 1927CONFIG_HUGETLBFS=y
1283CONFIG_HUGETLB_PAGE=y 1928CONFIG_HUGETLB_PAGE=y
1284CONFIG_RAMFS=y
1285# CONFIG_CONFIGFS_FS is not set 1929# CONFIG_CONFIGFS_FS is not set
1286 1930
1287# 1931#
@@ -1289,6 +1933,7 @@ CONFIG_RAMFS=y
1289# 1933#
1290# CONFIG_ADFS_FS is not set 1934# CONFIG_ADFS_FS is not set
1291# CONFIG_AFFS_FS is not set 1935# CONFIG_AFFS_FS is not set
1936# CONFIG_ECRYPT_FS is not set
1292# CONFIG_HFS_FS is not set 1937# CONFIG_HFS_FS is not set
1293# CONFIG_HFSPLUS_FS is not set 1938# CONFIG_HFSPLUS_FS is not set
1294# CONFIG_BEFS_FS is not set 1939# CONFIG_BEFS_FS is not set
@@ -1296,32 +1941,27 @@ CONFIG_RAMFS=y
1296# CONFIG_EFS_FS is not set 1941# CONFIG_EFS_FS is not set
1297# CONFIG_CRAMFS is not set 1942# CONFIG_CRAMFS is not set
1298# CONFIG_VXFS_FS is not set 1943# CONFIG_VXFS_FS is not set
1944# CONFIG_MINIX_FS is not set
1945# CONFIG_OMFS_FS is not set
1299# CONFIG_HPFS_FS is not set 1946# CONFIG_HPFS_FS is not set
1300# CONFIG_QNX4FS_FS is not set 1947# CONFIG_QNX4FS_FS is not set
1948# CONFIG_ROMFS_FS is not set
1301# CONFIG_SYSV_FS is not set 1949# CONFIG_SYSV_FS is not set
1302# CONFIG_UFS_FS is not set 1950# CONFIG_UFS_FS is not set
1303 1951CONFIG_NETWORK_FILESYSTEMS=y
1304#
1305# Network File Systems
1306#
1307CONFIG_NFS_FS=y 1952CONFIG_NFS_FS=y
1308CONFIG_NFS_V3=y 1953CONFIG_NFS_V3=y
1309# CONFIG_NFS_V3_ACL is not set 1954CONFIG_NFS_V3_ACL=y
1310# CONFIG_NFS_V4 is not set 1955CONFIG_NFS_V4=y
1311# CONFIG_NFS_DIRECTIO is not set
1312CONFIG_NFSD=y
1313CONFIG_NFSD_V3=y
1314# CONFIG_NFSD_V3_ACL is not set
1315# CONFIG_NFSD_V4 is not set
1316CONFIG_NFSD_TCP=y
1317CONFIG_ROOT_NFS=y 1956CONFIG_ROOT_NFS=y
1957# CONFIG_NFSD is not set
1318CONFIG_LOCKD=y 1958CONFIG_LOCKD=y
1319CONFIG_LOCKD_V4=y 1959CONFIG_LOCKD_V4=y
1320CONFIG_EXPORTFS=y 1960CONFIG_NFS_ACL_SUPPORT=y
1321CONFIG_NFS_COMMON=y 1961CONFIG_NFS_COMMON=y
1322CONFIG_SUNRPC=y 1962CONFIG_SUNRPC=y
1323# CONFIG_SUNRPC_BIND34 is not set 1963CONFIG_SUNRPC_GSS=y
1324# CONFIG_RPCSEC_GSS_KRB5 is not set 1964CONFIG_RPCSEC_GSS_KRB5=y
1325# CONFIG_RPCSEC_GSS_SPKM3 is not set 1965# CONFIG_RPCSEC_GSS_SPKM3 is not set
1326# CONFIG_SMB_FS is not set 1966# CONFIG_SMB_FS is not set
1327# CONFIG_CIFS is not set 1967# CONFIG_CIFS is not set
@@ -1332,14 +1972,26 @@ CONFIG_SUNRPC=y
1332# 1972#
1333# Partition Types 1973# Partition Types
1334# 1974#
1335# CONFIG_PARTITION_ADVANCED is not set 1975CONFIG_PARTITION_ADVANCED=y
1976# CONFIG_ACORN_PARTITION is not set
1977CONFIG_OSF_PARTITION=y
1978CONFIG_AMIGA_PARTITION=y
1979# CONFIG_ATARI_PARTITION is not set
1980CONFIG_MAC_PARTITION=y
1336CONFIG_MSDOS_PARTITION=y 1981CONFIG_MSDOS_PARTITION=y
1337 1982CONFIG_BSD_DISKLABEL=y
1338# 1983CONFIG_MINIX_SUBPARTITION=y
1339# Native Language Support 1984CONFIG_SOLARIS_X86_PARTITION=y
1340# 1985CONFIG_UNIXWARE_DISKLABEL=y
1986# CONFIG_LDM_PARTITION is not set
1987CONFIG_SGI_PARTITION=y
1988# CONFIG_ULTRIX_PARTITION is not set
1989CONFIG_SUN_PARTITION=y
1990CONFIG_KARMA_PARTITION=y
1991CONFIG_EFI_PARTITION=y
1992# CONFIG_SYSV68_PARTITION is not set
1341CONFIG_NLS=y 1993CONFIG_NLS=y
1342CONFIG_NLS_DEFAULT="iso8859-1" 1994CONFIG_NLS_DEFAULT="utf8"
1343CONFIG_NLS_CODEPAGE_437=y 1995CONFIG_NLS_CODEPAGE_437=y
1344# CONFIG_NLS_CODEPAGE_737 is not set 1996# CONFIG_NLS_CODEPAGE_737 is not set
1345# CONFIG_NLS_CODEPAGE_775 is not set 1997# CONFIG_NLS_CODEPAGE_775 is not set
@@ -1374,37 +2026,33 @@ CONFIG_NLS_ISO8859_1=y
1374# CONFIG_NLS_ISO8859_9 is not set 2026# CONFIG_NLS_ISO8859_9 is not set
1375# CONFIG_NLS_ISO8859_13 is not set 2027# CONFIG_NLS_ISO8859_13 is not set
1376# CONFIG_NLS_ISO8859_14 is not set 2028# CONFIG_NLS_ISO8859_14 is not set
1377CONFIG_NLS_ISO8859_15=y 2029# CONFIG_NLS_ISO8859_15 is not set
1378# CONFIG_NLS_KOI8_R is not set 2030# CONFIG_NLS_KOI8_R is not set
1379# CONFIG_NLS_KOI8_U is not set 2031# CONFIG_NLS_KOI8_U is not set
1380CONFIG_NLS_UTF8=y 2032CONFIG_NLS_UTF8=y
1381
1382#
1383# Distributed Lock Manager
1384#
1385# CONFIG_DLM is not set 2033# CONFIG_DLM is not set
1386CONFIG_INSTRUMENTATION=y
1387CONFIG_PROFILING=y
1388CONFIG_OPROFILE=y
1389CONFIG_KPROBES=y
1390 2034
1391# 2035#
1392# Kernel hacking 2036# Kernel hacking
1393# 2037#
1394CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2038CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1395# CONFIG_PRINTK_TIME is not set 2039CONFIG_PRINTK_TIME=y
1396# CONFIG_ENABLE_MUST_CHECK is not set 2040CONFIG_ENABLE_WARN_DEPRECATED=y
2041CONFIG_ENABLE_MUST_CHECK=y
2042CONFIG_FRAME_WARN=2048
1397CONFIG_MAGIC_SYSRQ=y 2043CONFIG_MAGIC_SYSRQ=y
1398CONFIG_UNUSED_SYMBOLS=y 2044# CONFIG_UNUSED_SYMBOLS is not set
1399# CONFIG_DEBUG_FS is not set 2045CONFIG_DEBUG_FS=y
1400# CONFIG_HEADERS_CHECK is not set 2046# CONFIG_HEADERS_CHECK is not set
1401CONFIG_DEBUG_KERNEL=y 2047CONFIG_DEBUG_KERNEL=y
1402# CONFIG_DEBUG_SHIRQ is not set 2048# CONFIG_DEBUG_SHIRQ is not set
1403CONFIG_DETECT_SOFTLOCKUP=y 2049# CONFIG_DETECT_SOFTLOCKUP is not set
1404# CONFIG_SCHED_DEBUG is not set 2050# CONFIG_SCHED_DEBUG is not set
1405# CONFIG_SCHEDSTATS is not set 2051CONFIG_SCHEDSTATS=y
1406CONFIG_TIMER_STATS=y 2052CONFIG_TIMER_STATS=y
2053# CONFIG_DEBUG_OBJECTS is not set
1407# CONFIG_SLUB_DEBUG_ON is not set 2054# CONFIG_SLUB_DEBUG_ON is not set
2055# CONFIG_SLUB_STATS is not set
1408# CONFIG_DEBUG_RT_MUTEXES is not set 2056# CONFIG_DEBUG_RT_MUTEXES is not set
1409# CONFIG_RT_MUTEX_TESTER is not set 2057# CONFIG_RT_MUTEX_TESTER is not set
1410# CONFIG_DEBUG_SPINLOCK is not set 2058# CONFIG_DEBUG_SPINLOCK is not set
@@ -1419,48 +2067,189 @@ CONFIG_TIMER_STATS=y
1419CONFIG_DEBUG_BUGVERBOSE=y 2067CONFIG_DEBUG_BUGVERBOSE=y
1420# CONFIG_DEBUG_INFO is not set 2068# CONFIG_DEBUG_INFO is not set
1421# CONFIG_DEBUG_VM is not set 2069# CONFIG_DEBUG_VM is not set
2070# CONFIG_DEBUG_WRITECOUNT is not set
2071CONFIG_DEBUG_MEMORY_INIT=y
1422# CONFIG_DEBUG_LIST is not set 2072# CONFIG_DEBUG_LIST is not set
1423# CONFIG_FRAME_POINTER is not set 2073# CONFIG_DEBUG_SG is not set
1424CONFIG_OPTIMIZE_INLINING=y 2074CONFIG_FRAME_POINTER=y
2075# CONFIG_BOOT_PRINTK_DELAY is not set
1425# CONFIG_RCU_TORTURE_TEST is not set 2076# CONFIG_RCU_TORTURE_TEST is not set
2077# CONFIG_KPROBES_SANITY_TEST is not set
2078# CONFIG_BACKTRACE_SELF_TEST is not set
1426# CONFIG_LKDTM is not set 2079# CONFIG_LKDTM is not set
1427# CONFIG_FAULT_INJECTION is not set 2080# CONFIG_FAULT_INJECTION is not set
2081# CONFIG_LATENCYTOP is not set
2082CONFIG_SYSCTL_SYSCALL_CHECK=y
2083CONFIG_HAVE_FTRACE=y
2084CONFIG_HAVE_DYNAMIC_FTRACE=y
2085# CONFIG_FTRACE is not set
2086# CONFIG_IRQSOFF_TRACER is not set
2087# CONFIG_SYSPROF_TRACER is not set
2088# CONFIG_SCHED_TRACER is not set
2089# CONFIG_CONTEXT_SWITCH_TRACER is not set
2090CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2091# CONFIG_SAMPLES is not set
2092CONFIG_HAVE_ARCH_KGDB=y
2093# CONFIG_KGDB is not set
2094# CONFIG_STRICT_DEVMEM is not set
2095CONFIG_X86_VERBOSE_BOOTUP=y
1428CONFIG_EARLY_PRINTK=y 2096CONFIG_EARLY_PRINTK=y
1429CONFIG_DEBUG_STACKOVERFLOW=y 2097CONFIG_DEBUG_STACKOVERFLOW=y
1430# CONFIG_DEBUG_STACK_USAGE is not set 2098CONFIG_DEBUG_STACK_USAGE=y
1431# CONFIG_DEBUG_RODATA is not set 2099# CONFIG_DEBUG_PAGEALLOC is not set
2100# CONFIG_DEBUG_PER_CPU_MAPS is not set
2101# CONFIG_X86_PTDUMP is not set
2102CONFIG_DEBUG_RODATA=y
2103# CONFIG_DEBUG_RODATA_TEST is not set
2104CONFIG_DEBUG_NX_TEST=m
1432# CONFIG_4KSTACKS is not set 2105# CONFIG_4KSTACKS is not set
1433CONFIG_X86_FIND_SMP_CONFIG=y
1434CONFIG_X86_MPPARSE=y
1435CONFIG_DOUBLEFAULT=y 2106CONFIG_DOUBLEFAULT=y
2107# CONFIG_MMIOTRACE is not set
2108CONFIG_IO_DELAY_TYPE_0X80=0
2109CONFIG_IO_DELAY_TYPE_0XED=1
2110CONFIG_IO_DELAY_TYPE_UDELAY=2
2111CONFIG_IO_DELAY_TYPE_NONE=3
2112CONFIG_IO_DELAY_0X80=y
2113# CONFIG_IO_DELAY_0XED is not set
2114# CONFIG_IO_DELAY_UDELAY is not set
2115# CONFIG_IO_DELAY_NONE is not set
2116CONFIG_DEFAULT_IO_DELAY_TYPE=0
2117CONFIG_DEBUG_BOOT_PARAMS=y
2118# CONFIG_CPA_DEBUG is not set
2119CONFIG_OPTIMIZE_INLINING=y
1436 2120
1437# 2121#
1438# Security options 2122# Security options
1439# 2123#
1440# CONFIG_KEYS is not set 2124CONFIG_KEYS=y
1441# CONFIG_SECURITY is not set 2125CONFIG_KEYS_DEBUG_PROC_KEYS=y
1442# CONFIG_CRYPTO is not set 2126CONFIG_SECURITY=y
2127CONFIG_SECURITY_NETWORK=y
2128# CONFIG_SECURITY_NETWORK_XFRM is not set
2129CONFIG_SECURITY_FILE_CAPABILITIES=y
2130# CONFIG_SECURITY_ROOTPLUG is not set
2131CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
2132CONFIG_SECURITY_SELINUX=y
2133CONFIG_SECURITY_SELINUX_BOOTPARAM=y
2134CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
2135CONFIG_SECURITY_SELINUX_DISABLE=y
2136CONFIG_SECURITY_SELINUX_DEVELOP=y
2137CONFIG_SECURITY_SELINUX_AVC_STATS=y
2138CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2139# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2140# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2141# CONFIG_SECURITY_SMACK is not set
2142CONFIG_CRYPTO=y
2143
2144#
2145# Crypto core or helper
2146#
2147CONFIG_CRYPTO_ALGAPI=y
2148CONFIG_CRYPTO_AEAD=y
2149CONFIG_CRYPTO_BLKCIPHER=y
2150CONFIG_CRYPTO_HASH=y
2151CONFIG_CRYPTO_MANAGER=y
2152# CONFIG_CRYPTO_GF128MUL is not set
2153# CONFIG_CRYPTO_NULL is not set
2154# CONFIG_CRYPTO_CRYPTD is not set
2155CONFIG_CRYPTO_AUTHENC=y
2156# CONFIG_CRYPTO_TEST is not set
2157
2158#
2159# Authenticated Encryption with Associated Data
2160#
2161# CONFIG_CRYPTO_CCM is not set
2162# CONFIG_CRYPTO_GCM is not set
2163# CONFIG_CRYPTO_SEQIV is not set
2164
2165#
2166# Block modes
2167#
2168CONFIG_CRYPTO_CBC=y
2169# CONFIG_CRYPTO_CTR is not set
2170# CONFIG_CRYPTO_CTS is not set
2171CONFIG_CRYPTO_ECB=y
2172# CONFIG_CRYPTO_LRW is not set
2173# CONFIG_CRYPTO_PCBC is not set
2174# CONFIG_CRYPTO_XTS is not set
2175
2176#
2177# Hash modes
2178#
2179CONFIG_CRYPTO_HMAC=y
2180# CONFIG_CRYPTO_XCBC is not set
2181
2182#
2183# Digest
2184#
2185# CONFIG_CRYPTO_CRC32C is not set
2186# CONFIG_CRYPTO_MD4 is not set
2187CONFIG_CRYPTO_MD5=y
2188# CONFIG_CRYPTO_MICHAEL_MIC is not set
2189# CONFIG_CRYPTO_RMD128 is not set
2190# CONFIG_CRYPTO_RMD160 is not set
2191# CONFIG_CRYPTO_RMD256 is not set
2192# CONFIG_CRYPTO_RMD320 is not set
2193CONFIG_CRYPTO_SHA1=y
2194# CONFIG_CRYPTO_SHA256 is not set
2195# CONFIG_CRYPTO_SHA512 is not set
2196# CONFIG_CRYPTO_TGR192 is not set
2197# CONFIG_CRYPTO_WP512 is not set
2198
2199#
2200# Ciphers
2201#
2202CONFIG_CRYPTO_AES=y
2203CONFIG_CRYPTO_AES_586=y
2204# CONFIG_CRYPTO_ANUBIS is not set
2205CONFIG_CRYPTO_ARC4=y
2206# CONFIG_CRYPTO_BLOWFISH is not set
2207# CONFIG_CRYPTO_CAMELLIA is not set
2208# CONFIG_CRYPTO_CAST5 is not set
2209# CONFIG_CRYPTO_CAST6 is not set
2210CONFIG_CRYPTO_DES=y
2211# CONFIG_CRYPTO_FCRYPT is not set
2212# CONFIG_CRYPTO_KHAZAD is not set
2213# CONFIG_CRYPTO_SALSA20 is not set
2214# CONFIG_CRYPTO_SALSA20_586 is not set
2215# CONFIG_CRYPTO_SEED is not set
2216# CONFIG_CRYPTO_SERPENT is not set
2217# CONFIG_CRYPTO_TEA is not set
2218# CONFIG_CRYPTO_TWOFISH is not set
2219# CONFIG_CRYPTO_TWOFISH_586 is not set
2220
2221#
2222# Compression
2223#
2224# CONFIG_CRYPTO_DEFLATE is not set
2225# CONFIG_CRYPTO_LZO is not set
2226CONFIG_CRYPTO_HW=y
2227# CONFIG_CRYPTO_DEV_PADLOCK is not set
2228# CONFIG_CRYPTO_DEV_GEODE is not set
2229# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2230CONFIG_HAVE_KVM=y
2231CONFIG_VIRTUALIZATION=y
2232# CONFIG_KVM is not set
2233# CONFIG_LGUEST is not set
2234# CONFIG_VIRTIO_PCI is not set
2235# CONFIG_VIRTIO_BALLOON is not set
1443 2236
1444# 2237#
1445# Library routines 2238# Library routines
1446# 2239#
1447CONFIG_BITREVERSE=y 2240CONFIG_BITREVERSE=y
2241CONFIG_GENERIC_FIND_FIRST_BIT=y
2242CONFIG_GENERIC_FIND_NEXT_BIT=y
1448# CONFIG_CRC_CCITT is not set 2243# CONFIG_CRC_CCITT is not set
1449# CONFIG_CRC16 is not set 2244# CONFIG_CRC16 is not set
2245CONFIG_CRC_T10DIF=y
1450# CONFIG_CRC_ITU_T is not set 2246# CONFIG_CRC_ITU_T is not set
1451CONFIG_CRC32=y 2247CONFIG_CRC32=y
1452# CONFIG_CRC7 is not set 2248# CONFIG_CRC7 is not set
1453# CONFIG_LIBCRC32C is not set 2249# CONFIG_LIBCRC32C is not set
2250CONFIG_AUDIT_GENERIC=y
1454CONFIG_ZLIB_INFLATE=y 2251CONFIG_ZLIB_INFLATE=y
1455CONFIG_PLIST=y 2252CONFIG_PLIST=y
1456CONFIG_HAS_IOMEM=y 2253CONFIG_HAS_IOMEM=y
1457CONFIG_HAS_IOPORT=y 2254CONFIG_HAS_IOPORT=y
1458CONFIG_HAS_DMA=y 2255CONFIG_HAS_DMA=y
1459CONFIG_GENERIC_HARDIRQS=y
1460CONFIG_GENERIC_IRQ_PROBE=y
1461CONFIG_GENERIC_PENDING_IRQ=y
1462CONFIG_X86_SMP=y
1463CONFIG_X86_HT=y
1464CONFIG_X86_BIOS_REBOOT=y
1465CONFIG_X86_TRAMPOLINE=y
1466CONFIG_KTIME_SCALAR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 2d6f5b2809d2..f0a03d7a7d63 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,64 +1,105 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.22-git14 3# Linux kernel version: 2.6.27-rc5
4# Fri Jul 20 09:53:15 2007 4# Wed Sep 3 17:13:39 2008
5# 5#
6CONFIG_X86_64=y
7CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y
8CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
9CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
10CONFIG_GENERIC_TIME_VSYSCALL=y
11CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
12CONFIG_ZONE_DMA32=y 14CONFIG_CLOCKSOURCE_WATCHDOG=y
15CONFIG_GENERIC_CLOCKEVENTS=y
16CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
13CONFIG_LOCKDEP_SUPPORT=y 17CONFIG_LOCKDEP_SUPPORT=y
14CONFIG_STACKTRACE_SUPPORT=y 18CONFIG_STACKTRACE_SUPPORT=y
15CONFIG_SEMAPHORE_SLEEPERS=y 19CONFIG_HAVE_LATENCYTOP_SUPPORT=y
20CONFIG_FAST_CMPXCHG_LOCAL=y
16CONFIG_MMU=y 21CONFIG_MMU=y
17CONFIG_ZONE_DMA=y 22CONFIG_ZONE_DMA=y
18CONFIG_QUICKLIST=y
19CONFIG_NR_QUICK=2
20CONFIG_RWSEM_GENERIC_SPINLOCK=y
21CONFIG_GENERIC_HWEIGHT=y
22CONFIG_GENERIC_CALIBRATE_DELAY=y
23CONFIG_X86_CMPXCHG=y
24CONFIG_EARLY_PRINTK=y
25CONFIG_GENERIC_ISA_DMA=y 23CONFIG_GENERIC_ISA_DMA=y
26CONFIG_GENERIC_IOMAP=y 24CONFIG_GENERIC_IOMAP=y
27CONFIG_ARCH_MAY_HAVE_PC_FDC=y
28CONFIG_ARCH_POPULATES_NODE_MAP=y
29CONFIG_DMI=y
30CONFIG_AUDIT_ARCH=y
31CONFIG_GENERIC_BUG=y 25CONFIG_GENERIC_BUG=y
26CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
28CONFIG_ARCH_MAY_HAVE_PC_FDC=y
29CONFIG_RWSEM_GENERIC_SPINLOCK=y
30# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
32# CONFIG_ARCH_HAS_ILOG2_U32 is not set 31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
33# CONFIG_ARCH_HAS_ILOG2_U64 is not set 32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y
35CONFIG_GENERIC_TIME_VSYSCALL=y
36CONFIG_ARCH_HAS_CPU_RELAX=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
41CONFIG_ARCH_SUSPEND_POSSIBLE=y
42CONFIG_ZONE_DMA32=y
43CONFIG_ARCH_POPULATES_NODE_MAP=y
44CONFIG_AUDIT_ARCH=y
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y
51CONFIG_X86_64_SMP=y
52CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y
55# CONFIG_KTIME_SCALAR is not set
34CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
35 57
36# 58#
37# Code maturity level options 59# General setup
38# 60#
39CONFIG_EXPERIMENTAL=y 61CONFIG_EXPERIMENTAL=y
40CONFIG_LOCK_KERNEL=y 62CONFIG_LOCK_KERNEL=y
41CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
42
43#
44# General setup
45#
46CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
47CONFIG_LOCALVERSION_AUTO=y 65# CONFIG_LOCALVERSION_AUTO is not set
48CONFIG_SWAP=y 66CONFIG_SWAP=y
49CONFIG_SYSVIPC=y 67CONFIG_SYSVIPC=y
50CONFIG_SYSVIPC_SYSCTL=y 68CONFIG_SYSVIPC_SYSCTL=y
51CONFIG_POSIX_MQUEUE=y 69CONFIG_POSIX_MQUEUE=y
52# CONFIG_BSD_PROCESS_ACCT is not set 70CONFIG_BSD_PROCESS_ACCT=y
53# CONFIG_TASKSTATS is not set 71# CONFIG_BSD_PROCESS_ACCT_V3 is not set
54# CONFIG_USER_NS is not set 72CONFIG_TASKSTATS=y
55# CONFIG_AUDIT is not set 73CONFIG_TASK_DELAY_ACCT=y
56CONFIG_IKCONFIG=y 74CONFIG_TASK_XACCT=y
57CONFIG_IKCONFIG_PROC=y 75CONFIG_TASK_IO_ACCOUNTING=y
58CONFIG_LOG_BUF_SHIFT=18 76CONFIG_AUDIT=y
59# CONFIG_CPUSETS is not set 77CONFIG_AUDITSYSCALL=y
60CONFIG_SYSFS_DEPRECATED=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17
81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y
84# CONFIG_CGROUP_DEVICE is not set
85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
87CONFIG_GROUP_SCHED=y
88CONFIG_FAIR_GROUP_SCHED=y
89# CONFIG_RT_GROUP_SCHED is not set
90# CONFIG_USER_SCHED is not set
91CONFIG_CGROUP_SCHED=y
92CONFIG_CGROUP_CPUACCT=y
93CONFIG_RESOURCE_COUNTERS=y
94# CONFIG_CGROUP_MEM_RES_CTLR is not set
95# CONFIG_SYSFS_DEPRECATED_V2 is not set
96CONFIG_PROC_PID_CPUSET=y
61CONFIG_RELAY=y 97CONFIG_RELAY=y
98CONFIG_NAMESPACES=y
99CONFIG_UTS_NS=y
100CONFIG_IPC_NS=y
101CONFIG_USER_NS=y
102CONFIG_PID_NS=y
62CONFIG_BLK_DEV_INITRD=y 103CONFIG_BLK_DEV_INITRD=y
63CONFIG_INITRAMFS_SOURCE="" 104CONFIG_INITRAMFS_SOURCE=""
64CONFIG_CC_OPTIMIZE_FOR_SIZE=y 105CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -68,11 +109,13 @@ CONFIG_UID16=y
68CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
69CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
70CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
71# CONFIG_KALLSYMS_EXTRA_PASS is not set 112CONFIG_KALLSYMS_EXTRA_PASS=y
72CONFIG_HOTPLUG=y 113CONFIG_HOTPLUG=y
73CONFIG_PRINTK=y 114CONFIG_PRINTK=y
74CONFIG_BUG=y 115CONFIG_BUG=y
75CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
118# CONFIG_COMPAT_BRK is not set
76CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
77CONFIG_FUTEX=y 120CONFIG_FUTEX=y
78CONFIG_ANON_INODES=y 121CONFIG_ANON_INODES=y
@@ -82,28 +125,49 @@ CONFIG_TIMERFD=y
82CONFIG_EVENTFD=y 125CONFIG_EVENTFD=y
83CONFIG_SHMEM=y 126CONFIG_SHMEM=y
84CONFIG_VM_EVENT_COUNTERS=y 127CONFIG_VM_EVENT_COUNTERS=y
85CONFIG_SLAB=y 128CONFIG_SLUB_DEBUG=y
86# CONFIG_SLUB is not set 129# CONFIG_SLAB is not set
130CONFIG_SLUB=y
87# CONFIG_SLOB is not set 131# CONFIG_SLOB is not set
132CONFIG_PROFILING=y
133CONFIG_MARKERS=y
134# CONFIG_OPROFILE is not set
135CONFIG_HAVE_OPROFILE=y
136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
140CONFIG_HAVE_KPROBES=y
141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
146CONFIG_PROC_PAGE_MONITOR=y
147# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
148CONFIG_SLABINFO=y
88CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
89# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
90CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
91CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
92CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
93CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
94# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
95# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
96# CONFIG_KMOD is not set 158CONFIG_KMOD=y
97CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
98CONFIG_BLOCK=y 160CONFIG_BLOCK=y
99# CONFIG_BLK_DEV_IO_TRACE is not set 161CONFIG_BLK_DEV_IO_TRACE=y
100# CONFIG_BLK_DEV_BSG is not set 162CONFIG_BLK_DEV_BSG=y
163# CONFIG_BLK_DEV_INTEGRITY is not set
164CONFIG_BLOCK_COMPAT=y
101 165
102# 166#
103# IO Schedulers 167# IO Schedulers
104# 168#
105CONFIG_IOSCHED_NOOP=y 169CONFIG_IOSCHED_NOOP=y
106# CONFIG_IOSCHED_AS is not set 170CONFIG_IOSCHED_AS=y
107CONFIG_IOSCHED_DEADLINE=y 171CONFIG_IOSCHED_DEADLINE=y
108CONFIG_IOSCHED_CFQ=y 172CONFIG_IOSCHED_CFQ=y
109# CONFIG_DEFAULT_AS is not set 173# CONFIG_DEFAULT_AS is not set
@@ -111,110 +175,177 @@ CONFIG_IOSCHED_CFQ=y
111CONFIG_DEFAULT_CFQ=y 175CONFIG_DEFAULT_CFQ=y
112# CONFIG_DEFAULT_NOOP is not set 176# CONFIG_DEFAULT_NOOP is not set
113CONFIG_DEFAULT_IOSCHED="cfq" 177CONFIG_DEFAULT_IOSCHED="cfq"
178CONFIG_CLASSIC_RCU=y
114 179
115# 180#
116# Processor type and features 181# Processor type and features
117# 182#
183CONFIG_TICK_ONESHOT=y
184CONFIG_NO_HZ=y
185CONFIG_HIGH_RES_TIMERS=y
186CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
187CONFIG_SMP=y
188CONFIG_X86_FIND_SMP_CONFIG=y
189CONFIG_X86_MPPARSE=y
118CONFIG_X86_PC=y 190CONFIG_X86_PC=y
191# CONFIG_X86_ELAN is not set
192# CONFIG_X86_VOYAGER is not set
193# CONFIG_X86_GENERICARCH is not set
119# CONFIG_X86_VSMP is not set 194# CONFIG_X86_VSMP is not set
195# CONFIG_PARAVIRT_GUEST is not set
196# CONFIG_MEMTEST is not set
197# CONFIG_M386 is not set
198# CONFIG_M486 is not set
199# CONFIG_M586 is not set
200# CONFIG_M586TSC is not set
201# CONFIG_M586MMX is not set
202# CONFIG_M686 is not set
203# CONFIG_MPENTIUMII is not set
204# CONFIG_MPENTIUMIII is not set
205# CONFIG_MPENTIUMM is not set
206# CONFIG_MPENTIUM4 is not set
207# CONFIG_MK6 is not set
208# CONFIG_MK7 is not set
120# CONFIG_MK8 is not set 209# CONFIG_MK8 is not set
210# CONFIG_MCRUSOE is not set
211# CONFIG_MEFFICEON is not set
212# CONFIG_MWINCHIPC6 is not set
213# CONFIG_MWINCHIP3D is not set
214# CONFIG_MGEODEGX1 is not set
215# CONFIG_MGEODE_LX is not set
216# CONFIG_MCYRIXIII is not set
217# CONFIG_MVIAC3_2 is not set
218# CONFIG_MVIAC7 is not set
121# CONFIG_MPSC is not set 219# CONFIG_MPSC is not set
122# CONFIG_MCORE2 is not set 220# CONFIG_MCORE2 is not set
123CONFIG_GENERIC_CPU=y 221CONFIG_GENERIC_CPU=y
222CONFIG_X86_CPU=y
124CONFIG_X86_L1_CACHE_BYTES=128 223CONFIG_X86_L1_CACHE_BYTES=128
125CONFIG_X86_L1_CACHE_SHIFT=7
126CONFIG_X86_INTERNODE_CACHE_BYTES=128 224CONFIG_X86_INTERNODE_CACHE_BYTES=128
225CONFIG_X86_CMPXCHG=y
226CONFIG_X86_L1_CACHE_SHIFT=7
227CONFIG_X86_WP_WORKS_OK=y
127CONFIG_X86_TSC=y 228CONFIG_X86_TSC=y
128CONFIG_X86_GOOD_APIC=y 229CONFIG_X86_CMPXCHG64=y
129# CONFIG_MICROCODE is not set 230CONFIG_X86_CMOV=y
130CONFIG_X86_MSR=y 231CONFIG_X86_MINIMUM_CPU_FAMILY=64
131CONFIG_X86_CPUID=y 232CONFIG_X86_DEBUGCTLMSR=y
132CONFIG_X86_HT=y 233CONFIG_HPET_TIMER=y
133CONFIG_X86_IO_APIC=y 234CONFIG_HPET_EMULATE_RTC=y
134CONFIG_X86_LOCAL_APIC=y 235CONFIG_DMI=y
135CONFIG_MTRR=y 236CONFIG_GART_IOMMU=y
136CONFIG_SMP=y 237CONFIG_CALGARY_IOMMU=y
238CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
239CONFIG_AMD_IOMMU=y
240CONFIG_SWIOTLB=y
241CONFIG_IOMMU_HELPER=y
242CONFIG_NR_CPUS=64
137CONFIG_SCHED_SMT=y 243CONFIG_SCHED_SMT=y
138CONFIG_SCHED_MC=y 244CONFIG_SCHED_MC=y
139# CONFIG_PREEMPT_NONE is not set 245# CONFIG_PREEMPT_NONE is not set
140CONFIG_PREEMPT_VOLUNTARY=y 246CONFIG_PREEMPT_VOLUNTARY=y
141# CONFIG_PREEMPT is not set 247# CONFIG_PREEMPT is not set
142CONFIG_PREEMPT_BKL=y 248CONFIG_X86_LOCAL_APIC=y
249CONFIG_X86_IO_APIC=y
250# CONFIG_X86_MCE is not set
251# CONFIG_I8K is not set
252CONFIG_MICROCODE=y
253CONFIG_MICROCODE_OLD_INTERFACE=y
254CONFIG_X86_MSR=y
255CONFIG_X86_CPUID=y
143CONFIG_NUMA=y 256CONFIG_NUMA=y
144CONFIG_K8_NUMA=y 257CONFIG_K8_NUMA=y
145CONFIG_NODES_SHIFT=6
146CONFIG_X86_64_ACPI_NUMA=y 258CONFIG_X86_64_ACPI_NUMA=y
147CONFIG_NUMA_EMU=y 259CONFIG_NODES_SPAN_OTHER_NODES=y
260# CONFIG_NUMA_EMU is not set
261CONFIG_NODES_SHIFT=6
262CONFIG_ARCH_SPARSEMEM_DEFAULT=y
263CONFIG_ARCH_SPARSEMEM_ENABLE=y
264CONFIG_ARCH_SELECT_MEMORY_MODEL=y
265CONFIG_SELECT_MEMORY_MODEL=y
266# CONFIG_FLATMEM_MANUAL is not set
267# CONFIG_DISCONTIGMEM_MANUAL is not set
268CONFIG_SPARSEMEM_MANUAL=y
269CONFIG_SPARSEMEM=y
148CONFIG_NEED_MULTIPLE_NODES=y 270CONFIG_NEED_MULTIPLE_NODES=y
271CONFIG_HAVE_MEMORY_PRESENT=y
149# CONFIG_SPARSEMEM_STATIC is not set 272# CONFIG_SPARSEMEM_STATIC is not set
273CONFIG_SPARSEMEM_EXTREME=y
274CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
275CONFIG_SPARSEMEM_VMEMMAP=y
276
277#
278# Memory hotplug is currently incompatible with Software Suspend
279#
280CONFIG_PAGEFLAGS_EXTENDED=y
150CONFIG_SPLIT_PTLOCK_CPUS=4 281CONFIG_SPLIT_PTLOCK_CPUS=4
151CONFIG_MIGRATION=y 282CONFIG_MIGRATION=y
152CONFIG_RESOURCES_64BIT=y 283CONFIG_RESOURCES_64BIT=y
153CONFIG_ZONE_DMA_FLAG=1 284CONFIG_ZONE_DMA_FLAG=1
154CONFIG_BOUNCE=y 285CONFIG_BOUNCE=y
155CONFIG_VIRT_TO_BUS=y 286CONFIG_VIRT_TO_BUS=y
156CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 287CONFIG_MTRR=y
157CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y 288# CONFIG_MTRR_SANITIZER is not set
158CONFIG_NR_CPUS=32 289CONFIG_X86_PAT=y
159CONFIG_PHYSICAL_ALIGN=0x200000 290CONFIG_EFI=y
160CONFIG_HOTPLUG_CPU=y
161CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
162CONFIG_HPET_TIMER=y
163CONFIG_HPET_EMULATE_RTC=y
164CONFIG_GART_IOMMU=y
165# CONFIG_CALGARY_IOMMU is not set
166CONFIG_SWIOTLB=y
167CONFIG_X86_MCE=y
168CONFIG_X86_MCE_INTEL=y
169CONFIG_X86_MCE_AMD=y
170# CONFIG_KEXEC is not set
171# CONFIG_CRASH_DUMP is not set
172# CONFIG_RELOCATABLE is not set
173CONFIG_PHYSICAL_START=0x200000
174CONFIG_SECCOMP=y 291CONFIG_SECCOMP=y
175# CONFIG_CC_STACKPROTECTOR is not set
176# CONFIG_HZ_100 is not set 292# CONFIG_HZ_100 is not set
177CONFIG_HZ_250=y 293# CONFIG_HZ_250 is not set
178# CONFIG_HZ_300 is not set 294# CONFIG_HZ_300 is not set
179# CONFIG_HZ_1000 is not set 295CONFIG_HZ_1000=y
180CONFIG_HZ=250 296CONFIG_HZ=1000
181CONFIG_K8_NB=y 297CONFIG_SCHED_HRTICK=y
182CONFIG_GENERIC_HARDIRQS=y 298CONFIG_KEXEC=y
183CONFIG_GENERIC_IRQ_PROBE=y 299CONFIG_CRASH_DUMP=y
184CONFIG_ISA_DMA_API=y 300CONFIG_PHYSICAL_START=0x1000000
185CONFIG_GENERIC_PENDING_IRQ=y 301CONFIG_RELOCATABLE=y
302CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set
305CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
306CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
186 307
187# 308#
188# Power management options 309# Power management options
189# 310#
311CONFIG_ARCH_HIBERNATION_HEADER=y
190CONFIG_PM=y 312CONFIG_PM=y
191# CONFIG_PM_LEGACY is not set 313CONFIG_PM_DEBUG=y
192# CONFIG_PM_DEBUG is not set 314# CONFIG_PM_VERBOSE is not set
315CONFIG_CAN_PM_TRACE=y
316CONFIG_PM_TRACE=y
317CONFIG_PM_TRACE_RTC=y
318CONFIG_PM_SLEEP_SMP=y
319CONFIG_PM_SLEEP=y
320CONFIG_SUSPEND=y
321# CONFIG_PM_TEST_SUSPEND is not set
322CONFIG_SUSPEND_FREEZER=y
193CONFIG_HIBERNATION=y 323CONFIG_HIBERNATION=y
194CONFIG_PM_STD_PARTITION="" 324CONFIG_PM_STD_PARTITION=""
195
196#
197# ACPI (Advanced Configuration and Power Interface) Support
198#
199CONFIG_ACPI=y 325CONFIG_ACPI=y
200CONFIG_ACPI_SLEEP=y 326CONFIG_ACPI_SLEEP=y
201CONFIG_ACPI_SLEEP_PROC_FS=y
202CONFIG_ACPI_SLEEP_PROC_SLEEP=y
203CONFIG_ACPI_PROCFS=y 327CONFIG_ACPI_PROCFS=y
328CONFIG_ACPI_PROCFS_POWER=y
329CONFIG_ACPI_SYSFS_POWER=y
330CONFIG_ACPI_PROC_EVENT=y
204CONFIG_ACPI_AC=y 331CONFIG_ACPI_AC=y
205CONFIG_ACPI_BATTERY=y 332CONFIG_ACPI_BATTERY=y
206CONFIG_ACPI_BUTTON=y 333CONFIG_ACPI_BUTTON=y
207CONFIG_ACPI_FAN=y 334CONFIG_ACPI_FAN=y
208# CONFIG_ACPI_DOCK is not set 335CONFIG_ACPI_DOCK=y
336# CONFIG_ACPI_BAY is not set
209CONFIG_ACPI_PROCESSOR=y 337CONFIG_ACPI_PROCESSOR=y
210CONFIG_ACPI_HOTPLUG_CPU=y 338CONFIG_ACPI_HOTPLUG_CPU=y
211CONFIG_ACPI_THERMAL=y 339CONFIG_ACPI_THERMAL=y
212CONFIG_ACPI_NUMA=y 340CONFIG_ACPI_NUMA=y
341# CONFIG_ACPI_WMI is not set
213# CONFIG_ACPI_ASUS is not set 342# CONFIG_ACPI_ASUS is not set
214# CONFIG_ACPI_TOSHIBA is not set 343# CONFIG_ACPI_TOSHIBA is not set
344# CONFIG_ACPI_CUSTOM_DSDT is not set
215CONFIG_ACPI_BLACKLIST_YEAR=0 345CONFIG_ACPI_BLACKLIST_YEAR=0
216# CONFIG_ACPI_DEBUG is not set 346# CONFIG_ACPI_DEBUG is not set
217CONFIG_ACPI_EC=y 347CONFIG_ACPI_EC=y
348# CONFIG_ACPI_PCI_SLOT is not set
218CONFIG_ACPI_POWER=y 349CONFIG_ACPI_POWER=y
219CONFIG_ACPI_SYSTEM=y 350CONFIG_ACPI_SYSTEM=y
220CONFIG_X86_PM_TIMER=y 351CONFIG_X86_PM_TIMER=y
@@ -227,29 +358,34 @@ CONFIG_ACPI_CONTAINER=y
227CONFIG_CPU_FREQ=y 358CONFIG_CPU_FREQ=y
228CONFIG_CPU_FREQ_TABLE=y 359CONFIG_CPU_FREQ_TABLE=y
229CONFIG_CPU_FREQ_DEBUG=y 360CONFIG_CPU_FREQ_DEBUG=y
230CONFIG_CPU_FREQ_STAT=y 361# CONFIG_CPU_FREQ_STAT is not set
231# CONFIG_CPU_FREQ_STAT_DETAILS is not set 362# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
232CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y 363# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
233# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set 364CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
365# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
366# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
234CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 367CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
235# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 368# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
236CONFIG_CPU_FREQ_GOV_USERSPACE=y 369CONFIG_CPU_FREQ_GOV_USERSPACE=y
237CONFIG_CPU_FREQ_GOV_ONDEMAND=y 370CONFIG_CPU_FREQ_GOV_ONDEMAND=y
238CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y 371# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
239 372
240# 373#
241# CPUFreq processor drivers 374# CPUFreq processor drivers
242# 375#
243CONFIG_X86_POWERNOW_K8=y
244CONFIG_X86_POWERNOW_K8_ACPI=y
245# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
246CONFIG_X86_ACPI_CPUFREQ=y 376CONFIG_X86_ACPI_CPUFREQ=y
377# CONFIG_X86_POWERNOW_K8 is not set
378# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
379# CONFIG_X86_P4_CLOCKMOD is not set
247 380
248# 381#
249# shared options 382# shared options
250# 383#
251CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y 384# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
252# CONFIG_X86_SPEEDSTEP_LIB is not set 385# CONFIG_X86_SPEEDSTEP_LIB is not set
386CONFIG_CPU_IDLE=y
387CONFIG_CPU_IDLE_GOV_LADDER=y
388CONFIG_CPU_IDLE_GOV_MENU=y
253 389
254# 390#
255# Bus options (PCI etc.) 391# Bus options (PCI etc.)
@@ -257,54 +393,91 @@ CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
257CONFIG_PCI=y 393CONFIG_PCI=y
258CONFIG_PCI_DIRECT=y 394CONFIG_PCI_DIRECT=y
259CONFIG_PCI_MMCONFIG=y 395CONFIG_PCI_MMCONFIG=y
396CONFIG_PCI_DOMAINS=y
397CONFIG_DMAR=y
398CONFIG_DMAR_GFX_WA=y
399CONFIG_DMAR_FLOPPY_WA=y
260CONFIG_PCIEPORTBUS=y 400CONFIG_PCIEPORTBUS=y
401# CONFIG_HOTPLUG_PCI_PCIE is not set
261CONFIG_PCIEAER=y 402CONFIG_PCIEAER=y
403# CONFIG_PCIEASPM is not set
262CONFIG_ARCH_SUPPORTS_MSI=y 404CONFIG_ARCH_SUPPORTS_MSI=y
263CONFIG_PCI_MSI=y 405CONFIG_PCI_MSI=y
406# CONFIG_PCI_LEGACY is not set
264# CONFIG_PCI_DEBUG is not set 407# CONFIG_PCI_DEBUG is not set
265# CONFIG_HT_IRQ is not set 408CONFIG_HT_IRQ=y
266 409CONFIG_ISA_DMA_API=y
267# 410CONFIG_K8_NB=y
268# PCCARD (PCMCIA/CardBus) support 411CONFIG_PCCARD=y
269# 412# CONFIG_PCMCIA_DEBUG is not set
270# CONFIG_PCCARD is not set 413CONFIG_PCMCIA=y
271# CONFIG_HOTPLUG_PCI is not set 414CONFIG_PCMCIA_LOAD_CIS=y
415CONFIG_PCMCIA_IOCTL=y
416CONFIG_CARDBUS=y
417
418#
419# PC-card bridges
420#
421CONFIG_YENTA=y
422CONFIG_YENTA_O2=y
423CONFIG_YENTA_RICOH=y
424CONFIG_YENTA_TI=y
425CONFIG_YENTA_ENE_TUNE=y
426CONFIG_YENTA_TOSHIBA=y
427# CONFIG_PD6729 is not set
428# CONFIG_I82092 is not set
429CONFIG_PCCARD_NONSTATIC=y
430CONFIG_HOTPLUG_PCI=y
431# CONFIG_HOTPLUG_PCI_FAKE is not set
432# CONFIG_HOTPLUG_PCI_ACPI is not set
433# CONFIG_HOTPLUG_PCI_CPCI is not set
434# CONFIG_HOTPLUG_PCI_SHPC is not set
272 435
273# 436#
274# Executable file formats / Emulations 437# Executable file formats / Emulations
275# 438#
276CONFIG_BINFMT_ELF=y 439CONFIG_BINFMT_ELF=y
277# CONFIG_BINFMT_MISC is not set 440CONFIG_COMPAT_BINFMT_ELF=y
441CONFIG_BINFMT_MISC=y
278CONFIG_IA32_EMULATION=y 442CONFIG_IA32_EMULATION=y
279CONFIG_IA32_AOUT=y 443# CONFIG_IA32_AOUT is not set
280CONFIG_COMPAT=y 444CONFIG_COMPAT=y
445CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
281CONFIG_SYSVIPC_COMPAT=y 446CONFIG_SYSVIPC_COMPAT=y
282
283#
284# Networking
285#
286CONFIG_NET=y 447CONFIG_NET=y
287 448
288# 449#
289# Networking options 450# Networking options
290# 451#
291CONFIG_PACKET=y 452CONFIG_PACKET=y
292# CONFIG_PACKET_MMAP is not set 453CONFIG_PACKET_MMAP=y
293CONFIG_UNIX=y 454CONFIG_UNIX=y
455CONFIG_XFRM=y
456CONFIG_XFRM_USER=y
457# CONFIG_XFRM_SUB_POLICY is not set
458# CONFIG_XFRM_MIGRATE is not set
459# CONFIG_XFRM_STATISTICS is not set
294# CONFIG_NET_KEY is not set 460# CONFIG_NET_KEY is not set
295CONFIG_INET=y 461CONFIG_INET=y
296CONFIG_IP_MULTICAST=y 462CONFIG_IP_MULTICAST=y
297# CONFIG_IP_ADVANCED_ROUTER is not set 463CONFIG_IP_ADVANCED_ROUTER=y
464CONFIG_ASK_IP_FIB_HASH=y
465# CONFIG_IP_FIB_TRIE is not set
298CONFIG_IP_FIB_HASH=y 466CONFIG_IP_FIB_HASH=y
467CONFIG_IP_MULTIPLE_TABLES=y
468CONFIG_IP_ROUTE_MULTIPATH=y
469CONFIG_IP_ROUTE_VERBOSE=y
299CONFIG_IP_PNP=y 470CONFIG_IP_PNP=y
300CONFIG_IP_PNP_DHCP=y 471CONFIG_IP_PNP_DHCP=y
301# CONFIG_IP_PNP_BOOTP is not set 472CONFIG_IP_PNP_BOOTP=y
302# CONFIG_IP_PNP_RARP is not set 473CONFIG_IP_PNP_RARP=y
303# CONFIG_NET_IPIP is not set 474# CONFIG_NET_IPIP is not set
304# CONFIG_NET_IPGRE is not set 475# CONFIG_NET_IPGRE is not set
305# CONFIG_IP_MROUTE is not set 476CONFIG_IP_MROUTE=y
477CONFIG_IP_PIMSM_V1=y
478CONFIG_IP_PIMSM_V2=y
306# CONFIG_ARPD is not set 479# CONFIG_ARPD is not set
307# CONFIG_SYN_COOKIES is not set 480CONFIG_SYN_COOKIES=y
308# CONFIG_INET_AH is not set 481# CONFIG_INET_AH is not set
309# CONFIG_INET_ESP is not set 482# CONFIG_INET_ESP is not set
310# CONFIG_INET_IPCOMP is not set 483# CONFIG_INET_IPCOMP is not set
@@ -313,31 +486,109 @@ CONFIG_INET_TUNNEL=y
313# CONFIG_INET_XFRM_MODE_TRANSPORT is not set 486# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
314# CONFIG_INET_XFRM_MODE_TUNNEL is not set 487# CONFIG_INET_XFRM_MODE_TUNNEL is not set
315# CONFIG_INET_XFRM_MODE_BEET is not set 488# CONFIG_INET_XFRM_MODE_BEET is not set
316CONFIG_INET_DIAG=y 489CONFIG_INET_LRO=y
317CONFIG_INET_TCP_DIAG=y 490# CONFIG_INET_DIAG is not set
318# CONFIG_TCP_CONG_ADVANCED is not set 491CONFIG_TCP_CONG_ADVANCED=y
492# CONFIG_TCP_CONG_BIC is not set
319CONFIG_TCP_CONG_CUBIC=y 493CONFIG_TCP_CONG_CUBIC=y
494# CONFIG_TCP_CONG_WESTWOOD is not set
495# CONFIG_TCP_CONG_HTCP is not set
496# CONFIG_TCP_CONG_HSTCP is not set
497# CONFIG_TCP_CONG_HYBLA is not set
498# CONFIG_TCP_CONG_VEGAS is not set
499# CONFIG_TCP_CONG_SCALABLE is not set
500# CONFIG_TCP_CONG_LP is not set
501# CONFIG_TCP_CONG_VENO is not set
502# CONFIG_TCP_CONG_YEAH is not set
503# CONFIG_TCP_CONG_ILLINOIS is not set
504# CONFIG_DEFAULT_BIC is not set
505CONFIG_DEFAULT_CUBIC=y
506# CONFIG_DEFAULT_HTCP is not set
507# CONFIG_DEFAULT_VEGAS is not set
508# CONFIG_DEFAULT_WESTWOOD is not set
509# CONFIG_DEFAULT_RENO is not set
320CONFIG_DEFAULT_TCP_CONG="cubic" 510CONFIG_DEFAULT_TCP_CONG="cubic"
321# CONFIG_TCP_MD5SIG is not set 511CONFIG_TCP_MD5SIG=y
512# CONFIG_IP_VS is not set
322CONFIG_IPV6=y 513CONFIG_IPV6=y
323# CONFIG_IPV6_PRIVACY is not set 514# CONFIG_IPV6_PRIVACY is not set
324# CONFIG_IPV6_ROUTER_PREF is not set 515# CONFIG_IPV6_ROUTER_PREF is not set
325# CONFIG_IPV6_OPTIMISTIC_DAD is not set 516# CONFIG_IPV6_OPTIMISTIC_DAD is not set
326# CONFIG_INET6_AH is not set 517CONFIG_INET6_AH=y
327# CONFIG_INET6_ESP is not set 518CONFIG_INET6_ESP=y
328# CONFIG_INET6_IPCOMP is not set 519# CONFIG_INET6_IPCOMP is not set
329# CONFIG_IPV6_MIP6 is not set 520# CONFIG_IPV6_MIP6 is not set
330# CONFIG_INET6_XFRM_TUNNEL is not set 521# CONFIG_INET6_XFRM_TUNNEL is not set
331# CONFIG_INET6_TUNNEL is not set 522# CONFIG_INET6_TUNNEL is not set
332# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set 523CONFIG_INET6_XFRM_MODE_TRANSPORT=y
333# CONFIG_INET6_XFRM_MODE_TUNNEL is not set 524CONFIG_INET6_XFRM_MODE_TUNNEL=y
334# CONFIG_INET6_XFRM_MODE_BEET is not set 525CONFIG_INET6_XFRM_MODE_BEET=y
335# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set 526# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
336CONFIG_IPV6_SIT=y 527CONFIG_IPV6_SIT=y
528CONFIG_IPV6_NDISC_NODETYPE=y
337# CONFIG_IPV6_TUNNEL is not set 529# CONFIG_IPV6_TUNNEL is not set
338# CONFIG_IPV6_MULTIPLE_TABLES is not set 530# CONFIG_IPV6_MULTIPLE_TABLES is not set
339# CONFIG_NETWORK_SECMARK is not set 531# CONFIG_IPV6_MROUTE is not set
340# CONFIG_NETFILTER is not set 532CONFIG_NETLABEL=y
533CONFIG_NETWORK_SECMARK=y
534CONFIG_NETFILTER=y
535# CONFIG_NETFILTER_DEBUG is not set
536# CONFIG_NETFILTER_ADVANCED is not set
537
538#
539# Core Netfilter Configuration
540#
541CONFIG_NETFILTER_NETLINK=y
542CONFIG_NETFILTER_NETLINK_LOG=y
543CONFIG_NF_CONNTRACK=y
544CONFIG_NF_CONNTRACK_SECMARK=y
545CONFIG_NF_CONNTRACK_FTP=y
546CONFIG_NF_CONNTRACK_IRC=y
547CONFIG_NF_CONNTRACK_SIP=y
548CONFIG_NF_CT_NETLINK=y
549CONFIG_NETFILTER_XTABLES=y
550CONFIG_NETFILTER_XT_TARGET_MARK=y
551CONFIG_NETFILTER_XT_TARGET_NFLOG=y
552CONFIG_NETFILTER_XT_TARGET_SECMARK=y
553CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
554CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
555CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
556CONFIG_NETFILTER_XT_MATCH_MARK=y
557CONFIG_NETFILTER_XT_MATCH_POLICY=y
558CONFIG_NETFILTER_XT_MATCH_STATE=y
559
560#
561# IP: Netfilter Configuration
562#
563CONFIG_NF_CONNTRACK_IPV4=y
564CONFIG_NF_CONNTRACK_PROC_COMPAT=y
565CONFIG_IP_NF_IPTABLES=y
566CONFIG_IP_NF_FILTER=y
567CONFIG_IP_NF_TARGET_REJECT=y
568CONFIG_IP_NF_TARGET_LOG=y
569CONFIG_IP_NF_TARGET_ULOG=y
570CONFIG_NF_NAT=y
571CONFIG_NF_NAT_NEEDED=y
572CONFIG_IP_NF_TARGET_MASQUERADE=y
573CONFIG_NF_NAT_FTP=y
574CONFIG_NF_NAT_IRC=y
575# CONFIG_NF_NAT_TFTP is not set
576# CONFIG_NF_NAT_AMANDA is not set
577# CONFIG_NF_NAT_PPTP is not set
578# CONFIG_NF_NAT_H323 is not set
579CONFIG_NF_NAT_SIP=y
580CONFIG_IP_NF_MANGLE=y
581
582#
583# IPv6: Netfilter Configuration
584#
585CONFIG_NF_CONNTRACK_IPV6=y
586CONFIG_IP6_NF_IPTABLES=y
587CONFIG_IP6_NF_MATCH_IPV6HEADER=y
588CONFIG_IP6_NF_FILTER=y
589CONFIG_IP6_NF_TARGET_LOG=y
590CONFIG_IP6_NF_TARGET_REJECT=y
591CONFIG_IP6_NF_MANGLE=y
341# CONFIG_IP_DCCP is not set 592# CONFIG_IP_DCCP is not set
342# CONFIG_IP_SCTP is not set 593# CONFIG_IP_SCTP is not set
343# CONFIG_TIPC is not set 594# CONFIG_TIPC is not set
@@ -345,6 +596,7 @@ CONFIG_IPV6_SIT=y
345# CONFIG_BRIDGE is not set 596# CONFIG_BRIDGE is not set
346# CONFIG_VLAN_8021Q is not set 597# CONFIG_VLAN_8021Q is not set
347# CONFIG_DECNET is not set 598# CONFIG_DECNET is not set
599CONFIG_LLC=y
348# CONFIG_LLC2 is not set 600# CONFIG_LLC2 is not set
349# CONFIG_IPX is not set 601# CONFIG_IPX is not set
350# CONFIG_ATALK is not set 602# CONFIG_ATALK is not set
@@ -352,28 +604,89 @@ CONFIG_IPV6_SIT=y
352# CONFIG_LAPB is not set 604# CONFIG_LAPB is not set
353# CONFIG_ECONET is not set 605# CONFIG_ECONET is not set
354# CONFIG_WAN_ROUTER is not set 606# CONFIG_WAN_ROUTER is not set
355 607CONFIG_NET_SCHED=y
356# 608
357# QoS and/or fair queueing 609#
358# 610# Queueing/Scheduling
359# CONFIG_NET_SCHED is not set 611#
612# CONFIG_NET_SCH_CBQ is not set
613# CONFIG_NET_SCH_HTB is not set
614# CONFIG_NET_SCH_HFSC is not set
615# CONFIG_NET_SCH_PRIO is not set
616# CONFIG_NET_SCH_RED is not set
617# CONFIG_NET_SCH_SFQ is not set
618# CONFIG_NET_SCH_TEQL is not set
619# CONFIG_NET_SCH_TBF is not set
620# CONFIG_NET_SCH_GRED is not set
621# CONFIG_NET_SCH_DSMARK is not set
622# CONFIG_NET_SCH_NETEM is not set
623# CONFIG_NET_SCH_INGRESS is not set
624
625#
626# Classification
627#
628CONFIG_NET_CLS=y
629# CONFIG_NET_CLS_BASIC is not set
630# CONFIG_NET_CLS_TCINDEX is not set
631# CONFIG_NET_CLS_ROUTE4 is not set
632# CONFIG_NET_CLS_FW is not set
633# CONFIG_NET_CLS_U32 is not set
634# CONFIG_NET_CLS_RSVP is not set
635# CONFIG_NET_CLS_RSVP6 is not set
636# CONFIG_NET_CLS_FLOW is not set
637CONFIG_NET_EMATCH=y
638CONFIG_NET_EMATCH_STACK=32
639# CONFIG_NET_EMATCH_CMP is not set
640# CONFIG_NET_EMATCH_NBYTE is not set
641# CONFIG_NET_EMATCH_U32 is not set
642# CONFIG_NET_EMATCH_META is not set
643# CONFIG_NET_EMATCH_TEXT is not set
644CONFIG_NET_CLS_ACT=y
645# CONFIG_NET_ACT_POLICE is not set
646# CONFIG_NET_ACT_GACT is not set
647# CONFIG_NET_ACT_MIRRED is not set
648# CONFIG_NET_ACT_IPT is not set
649# CONFIG_NET_ACT_NAT is not set
650# CONFIG_NET_ACT_PEDIT is not set
651# CONFIG_NET_ACT_SIMP is not set
652CONFIG_NET_SCH_FIFO=y
360 653
361# 654#
362# Network testing 655# Network testing
363# 656#
364# CONFIG_NET_PKTGEN is not set 657# CONFIG_NET_PKTGEN is not set
365# CONFIG_NET_TCPPROBE is not set 658# CONFIG_NET_TCPPROBE is not set
366# CONFIG_HAMRADIO is not set 659CONFIG_HAMRADIO=y
660
661#
662# Packet Radio protocols
663#
664# CONFIG_AX25 is not set
665# CONFIG_CAN is not set
367# CONFIG_IRDA is not set 666# CONFIG_IRDA is not set
368# CONFIG_BT is not set 667# CONFIG_BT is not set
369# CONFIG_AF_RXRPC is not set 668# CONFIG_AF_RXRPC is not set
669CONFIG_FIB_RULES=y
370 670
371# 671#
372# Wireless 672# Wireless
373# 673#
374# CONFIG_CFG80211 is not set 674CONFIG_CFG80211=y
375# CONFIG_WIRELESS_EXT is not set 675CONFIG_NL80211=y
376# CONFIG_MAC80211 is not set 676CONFIG_WIRELESS_EXT=y
677CONFIG_WIRELESS_EXT_SYSFS=y
678CONFIG_MAC80211=y
679
680#
681# Rate control algorithm selection
682#
683CONFIG_MAC80211_RC_PID=y
684CONFIG_MAC80211_RC_DEFAULT_PID=y
685CONFIG_MAC80211_RC_DEFAULT="pid"
686# CONFIG_MAC80211_MESH is not set
687CONFIG_MAC80211_LEDS=y
688# CONFIG_MAC80211_DEBUGFS is not set
689# CONFIG_MAC80211_DEBUG_MENU is not set
377# CONFIG_IEEE80211 is not set 690# CONFIG_IEEE80211 is not set
378# CONFIG_RFKILL is not set 691# CONFIG_RFKILL is not set
379# CONFIG_NET_9P is not set 692# CONFIG_NET_9P is not set
@@ -385,13 +698,17 @@ CONFIG_IPV6_SIT=y
385# 698#
386# Generic Driver Options 699# Generic Driver Options
387# 700#
701CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
388CONFIG_STANDALONE=y 702CONFIG_STANDALONE=y
389CONFIG_PREVENT_FIRMWARE_BUILD=y 703CONFIG_PREVENT_FIRMWARE_BUILD=y
390CONFIG_FW_LOADER=y 704CONFIG_FW_LOADER=y
705CONFIG_FIRMWARE_IN_KERNEL=y
706CONFIG_EXTRA_FIRMWARE=""
391# CONFIG_DEBUG_DRIVER is not set 707# CONFIG_DEBUG_DRIVER is not set
392# CONFIG_DEBUG_DEVRES is not set 708CONFIG_DEBUG_DEVRES=y
393# CONFIG_SYS_HYPERVISOR is not set 709# CONFIG_SYS_HYPERVISOR is not set
394# CONFIG_CONNECTOR is not set 710CONFIG_CONNECTOR=y
711CONFIG_PROC_EVENTS=y
395# CONFIG_MTD is not set 712# CONFIG_MTD is not set
396# CONFIG_PARPORT is not set 713# CONFIG_PARPORT is not set
397CONFIG_PNP=y 714CONFIG_PNP=y
@@ -402,7 +719,7 @@ CONFIG_PNP=y
402# 719#
403CONFIG_PNPACPI=y 720CONFIG_PNPACPI=y
404CONFIG_BLK_DEV=y 721CONFIG_BLK_DEV=y
405CONFIG_BLK_DEV_FD=y 722# CONFIG_BLK_DEV_FD is not set
406# CONFIG_BLK_CPQ_DA is not set 723# CONFIG_BLK_CPQ_DA is not set
407# CONFIG_BLK_CPQ_CISS_DA is not set 724# CONFIG_BLK_CPQ_CISS_DA is not set
408# CONFIG_BLK_DEV_DAC960 is not set 725# CONFIG_BLK_DEV_DAC960 is not set
@@ -415,82 +732,31 @@ CONFIG_BLK_DEV_LOOP=y
415# CONFIG_BLK_DEV_UB is not set 732# CONFIG_BLK_DEV_UB is not set
416CONFIG_BLK_DEV_RAM=y 733CONFIG_BLK_DEV_RAM=y
417CONFIG_BLK_DEV_RAM_COUNT=16 734CONFIG_BLK_DEV_RAM_COUNT=16
418CONFIG_BLK_DEV_RAM_SIZE=4096 735CONFIG_BLK_DEV_RAM_SIZE=16384
419CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 736# CONFIG_BLK_DEV_XIP is not set
420# CONFIG_CDROM_PKTCDVD is not set 737# CONFIG_CDROM_PKTCDVD is not set
421# CONFIG_ATA_OVER_ETH is not set 738# CONFIG_ATA_OVER_ETH is not set
739# CONFIG_BLK_DEV_HD is not set
422CONFIG_MISC_DEVICES=y 740CONFIG_MISC_DEVICES=y
423# CONFIG_IBM_ASM is not set 741# CONFIG_IBM_ASM is not set
424# CONFIG_PHANTOM is not set 742# CONFIG_PHANTOM is not set
425# CONFIG_EEPROM_93CX6 is not set 743# CONFIG_EEPROM_93CX6 is not set
426# CONFIG_SGI_IOC4 is not set 744# CONFIG_SGI_IOC4 is not set
427# CONFIG_TIFM_CORE is not set 745# CONFIG_TIFM_CORE is not set
746# CONFIG_ACER_WMI is not set
747# CONFIG_ASUS_LAPTOP is not set
748# CONFIG_FUJITSU_LAPTOP is not set
749# CONFIG_MSI_LAPTOP is not set
750# CONFIG_COMPAL_LAPTOP is not set
428# CONFIG_SONY_LAPTOP is not set 751# CONFIG_SONY_LAPTOP is not set
429# CONFIG_THINKPAD_ACPI is not set 752# CONFIG_THINKPAD_ACPI is not set
430CONFIG_IDE=y 753# CONFIG_INTEL_MENLOW is not set
431CONFIG_BLK_DEV_IDE=y 754# CONFIG_ENCLOSURE_SERVICES is not set
432 755# CONFIG_SGI_XP is not set
433# 756# CONFIG_HP_ILO is not set
434# Please see Documentation/ide.txt for help/info on IDE drives 757# CONFIG_SGI_GRU is not set
435# 758CONFIG_HAVE_IDE=y
436# CONFIG_BLK_DEV_IDE_SATA is not set 759# CONFIG_IDE is not set
437# CONFIG_BLK_DEV_HD_IDE is not set
438CONFIG_BLK_DEV_IDEDISK=y
439CONFIG_IDEDISK_MULTI_MODE=y
440CONFIG_BLK_DEV_IDECD=y
441# CONFIG_BLK_DEV_IDETAPE is not set
442# CONFIG_BLK_DEV_IDEFLOPPY is not set
443# CONFIG_BLK_DEV_IDESCSI is not set
444CONFIG_BLK_DEV_IDEACPI=y
445# CONFIG_IDE_TASK_IOCTL is not set
446CONFIG_IDE_PROC_FS=y
447
448#
449# IDE chipset support/bugfixes
450#
451CONFIG_IDE_GENERIC=y
452# CONFIG_BLK_DEV_CMD640 is not set
453# CONFIG_BLK_DEV_IDEPNP is not set
454CONFIG_BLK_DEV_IDEPCI=y
455# CONFIG_IDEPCI_SHARE_IRQ is not set
456CONFIG_IDEPCI_PCIBUS_ORDER=y
457# CONFIG_BLK_DEV_OFFBOARD is not set
458# CONFIG_BLK_DEV_GENERIC is not set
459# CONFIG_BLK_DEV_OPTI621 is not set
460# CONFIG_BLK_DEV_RZ1000 is not set
461CONFIG_BLK_DEV_IDEDMA_PCI=y
462# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
463# CONFIG_IDEDMA_ONLYDISK is not set
464# CONFIG_BLK_DEV_AEC62XX is not set
465# CONFIG_BLK_DEV_ALI15X3 is not set
466CONFIG_BLK_DEV_AMD74XX=y
467CONFIG_BLK_DEV_ATIIXP=y
468# CONFIG_BLK_DEV_CMD64X is not set
469# CONFIG_BLK_DEV_TRIFLEX is not set
470# CONFIG_BLK_DEV_CY82C693 is not set
471# CONFIG_BLK_DEV_CS5520 is not set
472# CONFIG_BLK_DEV_CS5530 is not set
473# CONFIG_BLK_DEV_HPT34X is not set
474# CONFIG_BLK_DEV_HPT366 is not set
475# CONFIG_BLK_DEV_JMICRON is not set
476# CONFIG_BLK_DEV_SC1200 is not set
477CONFIG_BLK_DEV_PIIX=y
478# CONFIG_BLK_DEV_IT8213 is not set
479# CONFIG_BLK_DEV_IT821X is not set
480# CONFIG_BLK_DEV_NS87415 is not set
481# CONFIG_BLK_DEV_PDC202XX_OLD is not set
482CONFIG_BLK_DEV_PDC202XX_NEW=y
483# CONFIG_BLK_DEV_SVWKS is not set
484# CONFIG_BLK_DEV_SIIMAGE is not set
485# CONFIG_BLK_DEV_SIS5513 is not set
486# CONFIG_BLK_DEV_SLC90E66 is not set
487# CONFIG_BLK_DEV_TRM290 is not set
488# CONFIG_BLK_DEV_VIA82CXXX is not set
489# CONFIG_BLK_DEV_TC86C001 is not set
490# CONFIG_IDE_ARM is not set
491CONFIG_BLK_DEV_IDEDMA=y
492# CONFIG_IDEDMA_IVB is not set
493# CONFIG_BLK_DEV_HD is not set
494 760
495# 761#
496# SCSI device support 762# SCSI device support
@@ -499,8 +765,8 @@ CONFIG_BLK_DEV_IDEDMA=y
499CONFIG_SCSI=y 765CONFIG_SCSI=y
500CONFIG_SCSI_DMA=y 766CONFIG_SCSI_DMA=y
501# CONFIG_SCSI_TGT is not set 767# CONFIG_SCSI_TGT is not set
502CONFIG_SCSI_NETLINK=y 768# CONFIG_SCSI_NETLINK is not set
503# CONFIG_SCSI_PROC_FS is not set 769CONFIG_SCSI_PROC_FS=y
504 770
505# 771#
506# SCSI support type (disk, tape, CD-ROM) 772# SCSI support type (disk, tape, CD-ROM)
@@ -509,7 +775,7 @@ CONFIG_BLK_DEV_SD=y
509# CONFIG_CHR_DEV_ST is not set 775# CONFIG_CHR_DEV_ST is not set
510# CONFIG_CHR_DEV_OSST is not set 776# CONFIG_CHR_DEV_OSST is not set
511CONFIG_BLK_DEV_SR=y 777CONFIG_BLK_DEV_SR=y
512# CONFIG_BLK_DEV_SR_VENDOR is not set 778CONFIG_BLK_DEV_SR_VENDOR=y
513CONFIG_CHR_DEV_SG=y 779CONFIG_CHR_DEV_SG=y
514# CONFIG_CHR_DEV_SCH is not set 780# CONFIG_CHR_DEV_SCH is not set
515 781
@@ -526,73 +792,38 @@ CONFIG_SCSI_WAIT_SCAN=m
526# SCSI Transports 792# SCSI Transports
527# 793#
528CONFIG_SCSI_SPI_ATTRS=y 794CONFIG_SCSI_SPI_ATTRS=y
529CONFIG_SCSI_FC_ATTRS=y 795# CONFIG_SCSI_FC_ATTRS is not set
530# CONFIG_SCSI_ISCSI_ATTRS is not set 796CONFIG_SCSI_ISCSI_ATTRS=y
531CONFIG_SCSI_SAS_ATTRS=y 797# CONFIG_SCSI_SAS_ATTRS is not set
532# CONFIG_SCSI_SAS_LIBSAS is not set 798# CONFIG_SCSI_SAS_LIBSAS is not set
533 799# CONFIG_SCSI_SRP_ATTRS is not set
534# 800# CONFIG_SCSI_LOWLEVEL is not set
535# SCSI low-level drivers 801# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
536# 802# CONFIG_SCSI_DH is not set
537# CONFIG_ISCSI_TCP is not set
538# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
539# CONFIG_SCSI_3W_9XXX is not set
540# CONFIG_SCSI_ACARD is not set
541# CONFIG_SCSI_AACRAID is not set
542# CONFIG_SCSI_AIC7XXX is not set
543# CONFIG_SCSI_AIC7XXX_OLD is not set
544CONFIG_SCSI_AIC79XX=y
545CONFIG_AIC79XX_CMDS_PER_DEVICE=32
546CONFIG_AIC79XX_RESET_DELAY_MS=4000
547# CONFIG_AIC79XX_DEBUG_ENABLE is not set
548CONFIG_AIC79XX_DEBUG_MASK=0
549# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
550# CONFIG_SCSI_AIC94XX is not set
551# CONFIG_SCSI_ARCMSR is not set
552# CONFIG_MEGARAID_NEWGEN is not set
553# CONFIG_MEGARAID_LEGACY is not set
554# CONFIG_MEGARAID_SAS is not set
555# CONFIG_SCSI_HPTIOP is not set
556# CONFIG_SCSI_BUSLOGIC is not set
557# CONFIG_SCSI_DMX3191D is not set
558# CONFIG_SCSI_EATA is not set
559# CONFIG_SCSI_FUTURE_DOMAIN is not set
560# CONFIG_SCSI_GDTH is not set
561# CONFIG_SCSI_IPS is not set
562# CONFIG_SCSI_INITIO is not set
563# CONFIG_SCSI_INIA100 is not set
564# CONFIG_SCSI_STEX is not set
565# CONFIG_SCSI_SYM53C8XX_2 is not set
566# CONFIG_SCSI_IPR is not set
567# CONFIG_SCSI_QLOGIC_1280 is not set
568# CONFIG_SCSI_QLA_FC is not set
569# CONFIG_SCSI_QLA_ISCSI is not set
570# CONFIG_SCSI_LPFC is not set
571# CONFIG_SCSI_DC395x is not set
572# CONFIG_SCSI_DC390T is not set
573# CONFIG_SCSI_DEBUG is not set
574# CONFIG_SCSI_SRP is not set
575CONFIG_ATA=y 803CONFIG_ATA=y
576# CONFIG_ATA_NONSTANDARD is not set 804# CONFIG_ATA_NONSTANDARD is not set
577CONFIG_ATA_ACPI=y 805CONFIG_ATA_ACPI=y
806CONFIG_SATA_PMP=y
578CONFIG_SATA_AHCI=y 807CONFIG_SATA_AHCI=y
579CONFIG_SATA_SVW=y 808# CONFIG_SATA_SIL24 is not set
809CONFIG_ATA_SFF=y
810# CONFIG_SATA_SVW is not set
580CONFIG_ATA_PIIX=y 811CONFIG_ATA_PIIX=y
581# CONFIG_SATA_MV is not set 812# CONFIG_SATA_MV is not set
582CONFIG_SATA_NV=y 813# CONFIG_SATA_NV is not set
583# CONFIG_PDC_ADMA is not set 814# CONFIG_PDC_ADMA is not set
584# CONFIG_SATA_QSTOR is not set 815# CONFIG_SATA_QSTOR is not set
585# CONFIG_SATA_PROMISE is not set 816# CONFIG_SATA_PROMISE is not set
586# CONFIG_SATA_SX4 is not set 817# CONFIG_SATA_SX4 is not set
587CONFIG_SATA_SIL=y 818# CONFIG_SATA_SIL is not set
588# CONFIG_SATA_SIL24 is not set
589# CONFIG_SATA_SIS is not set 819# CONFIG_SATA_SIS is not set
590# CONFIG_SATA_ULI is not set 820# CONFIG_SATA_ULI is not set
591CONFIG_SATA_VIA=y 821# CONFIG_SATA_VIA is not set
592# CONFIG_SATA_VITESSE is not set 822# CONFIG_SATA_VITESSE is not set
593# CONFIG_SATA_INIC162X is not set 823# CONFIG_SATA_INIC162X is not set
824# CONFIG_PATA_ACPI is not set
594# CONFIG_PATA_ALI is not set 825# CONFIG_PATA_ALI is not set
595# CONFIG_PATA_AMD is not set 826CONFIG_PATA_AMD=y
596# CONFIG_PATA_ARTOP is not set 827# CONFIG_PATA_ARTOP is not set
597# CONFIG_PATA_ATIIXP is not set 828# CONFIG_PATA_ATIIXP is not set
598# CONFIG_PATA_CMD640_PCI is not set 829# CONFIG_PATA_CMD640_PCI is not set
@@ -612,11 +843,14 @@ CONFIG_SATA_VIA=y
612# CONFIG_PATA_TRIFLEX is not set 843# CONFIG_PATA_TRIFLEX is not set
613# CONFIG_PATA_MARVELL is not set 844# CONFIG_PATA_MARVELL is not set
614# CONFIG_PATA_MPIIX is not set 845# CONFIG_PATA_MPIIX is not set
615# CONFIG_PATA_OLDPIIX is not set 846CONFIG_PATA_OLDPIIX=y
616# CONFIG_PATA_NETCELL is not set 847# CONFIG_PATA_NETCELL is not set
848# CONFIG_PATA_NINJA32 is not set
617# CONFIG_PATA_NS87410 is not set 849# CONFIG_PATA_NS87410 is not set
850# CONFIG_PATA_NS87415 is not set
618# CONFIG_PATA_OPTI is not set 851# CONFIG_PATA_OPTI is not set
619# CONFIG_PATA_OPTIDMA is not set 852# CONFIG_PATA_OPTIDMA is not set
853# CONFIG_PATA_PCMCIA is not set
620# CONFIG_PATA_PDC_OLD is not set 854# CONFIG_PATA_PDC_OLD is not set
621# CONFIG_PATA_RADISYS is not set 855# CONFIG_PATA_RADISYS is not set
622# CONFIG_PATA_RZ1000 is not set 856# CONFIG_PATA_RZ1000 is not set
@@ -627,147 +861,186 @@ CONFIG_SATA_VIA=y
627# CONFIG_PATA_SIS is not set 861# CONFIG_PATA_SIS is not set
628# CONFIG_PATA_VIA is not set 862# CONFIG_PATA_VIA is not set
629# CONFIG_PATA_WINBOND is not set 863# CONFIG_PATA_WINBOND is not set
864CONFIG_PATA_SCH=y
630CONFIG_MD=y 865CONFIG_MD=y
631# CONFIG_BLK_DEV_MD is not set 866CONFIG_BLK_DEV_MD=y
867# CONFIG_MD_LINEAR is not set
868# CONFIG_MD_RAID0 is not set
869# CONFIG_MD_RAID1 is not set
870# CONFIG_MD_RAID10 is not set
871# CONFIG_MD_RAID456 is not set
872# CONFIG_MD_MULTIPATH is not set
873# CONFIG_MD_FAULTY is not set
632CONFIG_BLK_DEV_DM=y 874CONFIG_BLK_DEV_DM=y
633# CONFIG_DM_DEBUG is not set 875# CONFIG_DM_DEBUG is not set
634# CONFIG_DM_CRYPT is not set 876# CONFIG_DM_CRYPT is not set
635# CONFIG_DM_SNAPSHOT is not set 877# CONFIG_DM_SNAPSHOT is not set
636# CONFIG_DM_MIRROR is not set 878CONFIG_DM_MIRROR=y
637# CONFIG_DM_ZERO is not set 879CONFIG_DM_ZERO=y
638# CONFIG_DM_MULTIPATH is not set 880# CONFIG_DM_MULTIPATH is not set
639# CONFIG_DM_DELAY is not set 881# CONFIG_DM_DELAY is not set
640 882# CONFIG_DM_UEVENT is not set
641# 883# CONFIG_FUSION is not set
642# Fusion MPT device support
643#
644CONFIG_FUSION=y
645CONFIG_FUSION_SPI=y
646# CONFIG_FUSION_FC is not set
647# CONFIG_FUSION_SAS is not set
648CONFIG_FUSION_MAX_SGE=128
649# CONFIG_FUSION_CTL is not set
650 884
651# 885#
652# IEEE 1394 (FireWire) support 886# IEEE 1394 (FireWire) support
653# 887#
654# CONFIG_FIREWIRE is not set
655CONFIG_IEEE1394=y
656
657#
658# Subsystem Options
659#
660# CONFIG_IEEE1394_VERBOSEDEBUG is not set
661
662#
663# Controllers
664#
665 888
666# 889#
667# Texas Instruments PCILynx requires I2C 890# Enable only one of the two stacks, unless you know what you are doing
668# 891#
669CONFIG_IEEE1394_OHCI1394=y 892# CONFIG_FIREWIRE is not set
670 893# CONFIG_IEEE1394 is not set
671#
672# Protocols
673#
674# CONFIG_IEEE1394_VIDEO1394 is not set
675# CONFIG_IEEE1394_SBP2 is not set
676# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
677# CONFIG_IEEE1394_ETH1394 is not set
678# CONFIG_IEEE1394_DV1394 is not set
679CONFIG_IEEE1394_RAWIO=y
680# CONFIG_I2O is not set 894# CONFIG_I2O is not set
681CONFIG_MACINTOSH_DRIVERS=y 895CONFIG_MACINTOSH_DRIVERS=y
682# CONFIG_MAC_EMUMOUSEBTN is not set 896CONFIG_MAC_EMUMOUSEBTN=y
683CONFIG_NETDEVICES=y 897CONFIG_NETDEVICES=y
684CONFIG_NETDEVICES_MULTIQUEUE=y 898# CONFIG_IFB is not set
685# CONFIG_DUMMY is not set 899# CONFIG_DUMMY is not set
686# CONFIG_BONDING is not set 900# CONFIG_BONDING is not set
687# CONFIG_MACVLAN is not set 901# CONFIG_MACVLAN is not set
688# CONFIG_EQUALIZER is not set 902# CONFIG_EQUALIZER is not set
689CONFIG_TUN=y 903# CONFIG_TUN is not set
904# CONFIG_VETH is not set
690# CONFIG_NET_SB1000 is not set 905# CONFIG_NET_SB1000 is not set
691# CONFIG_ARCNET is not set 906# CONFIG_ARCNET is not set
692# CONFIG_PHYLIB is not set 907CONFIG_PHYLIB=y
908
909#
910# MII PHY device drivers
911#
912# CONFIG_MARVELL_PHY is not set
913# CONFIG_DAVICOM_PHY is not set
914# CONFIG_QSEMI_PHY is not set
915# CONFIG_LXT_PHY is not set
916# CONFIG_CICADA_PHY is not set
917# CONFIG_VITESSE_PHY is not set
918# CONFIG_SMSC_PHY is not set
919# CONFIG_BROADCOM_PHY is not set
920# CONFIG_ICPLUS_PHY is not set
921# CONFIG_REALTEK_PHY is not set
922# CONFIG_FIXED_PHY is not set
923# CONFIG_MDIO_BITBANG is not set
693CONFIG_NET_ETHERNET=y 924CONFIG_NET_ETHERNET=y
694CONFIG_MII=y 925CONFIG_MII=y
695# CONFIG_HAPPYMEAL is not set 926# CONFIG_HAPPYMEAL is not set
696# CONFIG_SUNGEM is not set 927# CONFIG_SUNGEM is not set
697# CONFIG_CASSINI is not set 928# CONFIG_CASSINI is not set
698CONFIG_NET_VENDOR_3COM=y 929CONFIG_NET_VENDOR_3COM=y
699CONFIG_VORTEX=y 930# CONFIG_VORTEX is not set
700# CONFIG_TYPHOON is not set 931# CONFIG_TYPHOON is not set
701CONFIG_NET_TULIP=y 932CONFIG_NET_TULIP=y
702# CONFIG_DE2104X is not set 933# CONFIG_DE2104X is not set
703CONFIG_TULIP=y 934# CONFIG_TULIP is not set
704# CONFIG_TULIP_MWI is not set
705# CONFIG_TULIP_MMIO is not set
706# CONFIG_TULIP_NAPI is not set
707# CONFIG_DE4X5 is not set 935# CONFIG_DE4X5 is not set
708# CONFIG_WINBOND_840 is not set 936# CONFIG_WINBOND_840 is not set
709# CONFIG_DM9102 is not set 937# CONFIG_DM9102 is not set
710# CONFIG_ULI526X is not set 938# CONFIG_ULI526X is not set
939# CONFIG_PCMCIA_XIRCOM is not set
711# CONFIG_HP100 is not set 940# CONFIG_HP100 is not set
941# CONFIG_IBM_NEW_EMAC_ZMII is not set
942# CONFIG_IBM_NEW_EMAC_RGMII is not set
943# CONFIG_IBM_NEW_EMAC_TAH is not set
944# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
712CONFIG_NET_PCI=y 945CONFIG_NET_PCI=y
713# CONFIG_PCNET32 is not set 946# CONFIG_PCNET32 is not set
714CONFIG_AMD8111_ETH=y 947# CONFIG_AMD8111_ETH is not set
715# CONFIG_AMD8111E_NAPI is not set
716# CONFIG_ADAPTEC_STARFIRE is not set 948# CONFIG_ADAPTEC_STARFIRE is not set
717CONFIG_B44=y 949# CONFIG_B44 is not set
718CONFIG_FORCEDETH=y 950CONFIG_FORCEDETH=y
719# CONFIG_FORCEDETH_NAPI is not set 951# CONFIG_FORCEDETH_NAPI is not set
720# CONFIG_DGRS is not set
721# CONFIG_EEPRO100 is not set 952# CONFIG_EEPRO100 is not set
722CONFIG_E100=y 953CONFIG_E100=y
723# CONFIG_FEALNX is not set 954# CONFIG_FEALNX is not set
724# CONFIG_NATSEMI is not set 955# CONFIG_NATSEMI is not set
725# CONFIG_NE2K_PCI is not set 956# CONFIG_NE2K_PCI is not set
726CONFIG_8139CP=y 957# CONFIG_8139CP is not set
727CONFIG_8139TOO=y 958CONFIG_8139TOO=y
728# CONFIG_8139TOO_PIO is not set 959CONFIG_8139TOO_PIO=y
729# CONFIG_8139TOO_TUNE_TWISTER is not set 960# CONFIG_8139TOO_TUNE_TWISTER is not set
730# CONFIG_8139TOO_8129 is not set 961# CONFIG_8139TOO_8129 is not set
731# CONFIG_8139_OLD_RX_RESET is not set 962# CONFIG_8139_OLD_RX_RESET is not set
963# CONFIG_R6040 is not set
732# CONFIG_SIS900 is not set 964# CONFIG_SIS900 is not set
733# CONFIG_EPIC100 is not set 965# CONFIG_EPIC100 is not set
734# CONFIG_SUNDANCE is not set 966# CONFIG_SUNDANCE is not set
967# CONFIG_TLAN is not set
735# CONFIG_VIA_RHINE is not set 968# CONFIG_VIA_RHINE is not set
736# CONFIG_SC92031 is not set 969# CONFIG_SC92031 is not set
737CONFIG_NETDEV_1000=y 970CONFIG_NETDEV_1000=y
738# CONFIG_ACENIC is not set 971# CONFIG_ACENIC is not set
739# CONFIG_DL2K is not set 972# CONFIG_DL2K is not set
740CONFIG_E1000=y 973CONFIG_E1000=y
741# CONFIG_E1000_NAPI is not set
742# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 974# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
975# CONFIG_E1000E is not set
976# CONFIG_IP1000 is not set
977# CONFIG_IGB is not set
743# CONFIG_NS83820 is not set 978# CONFIG_NS83820 is not set
744# CONFIG_HAMACHI is not set 979# CONFIG_HAMACHI is not set
745# CONFIG_YELLOWFIN is not set 980# CONFIG_YELLOWFIN is not set
746# CONFIG_R8169 is not set 981# CONFIG_R8169 is not set
747# CONFIG_SIS190 is not set 982# CONFIG_SIS190 is not set
748# CONFIG_SKGE is not set 983# CONFIG_SKGE is not set
749# CONFIG_SKY2 is not set 984CONFIG_SKY2=y
985# CONFIG_SKY2_DEBUG is not set
750# CONFIG_VIA_VELOCITY is not set 986# CONFIG_VIA_VELOCITY is not set
751CONFIG_TIGON3=y 987CONFIG_TIGON3=y
752CONFIG_BNX2=y 988# CONFIG_BNX2 is not set
753# CONFIG_QLA3XXX is not set 989# CONFIG_QLA3XXX is not set
754# CONFIG_ATL1 is not set 990# CONFIG_ATL1 is not set
991# CONFIG_ATL1E is not set
755CONFIG_NETDEV_10000=y 992CONFIG_NETDEV_10000=y
756# CONFIG_CHELSIO_T1 is not set 993# CONFIG_CHELSIO_T1 is not set
757# CONFIG_CHELSIO_T3 is not set 994# CONFIG_CHELSIO_T3 is not set
995# CONFIG_IXGBE is not set
758# CONFIG_IXGB is not set 996# CONFIG_IXGB is not set
759CONFIG_S2IO=m 997# CONFIG_S2IO is not set
760# CONFIG_S2IO_NAPI is not set
761# CONFIG_MYRI10GE is not set 998# CONFIG_MYRI10GE is not set
762# CONFIG_NETXEN_NIC is not set 999# CONFIG_NETXEN_NIC is not set
1000# CONFIG_NIU is not set
763# CONFIG_MLX4_CORE is not set 1001# CONFIG_MLX4_CORE is not set
764# CONFIG_TR is not set 1002# CONFIG_TEHUTI is not set
1003# CONFIG_BNX2X is not set
1004# CONFIG_SFC is not set
1005CONFIG_TR=y
1006# CONFIG_IBMOL is not set
1007# CONFIG_3C359 is not set
1008# CONFIG_TMS380TR is not set
765 1009
766# 1010#
767# Wireless LAN 1011# Wireless LAN
768# 1012#
769# CONFIG_WLAN_PRE80211 is not set 1013# CONFIG_WLAN_PRE80211 is not set
770# CONFIG_WLAN_80211 is not set 1014CONFIG_WLAN_80211=y
1015# CONFIG_PCMCIA_RAYCS is not set
1016# CONFIG_IPW2100 is not set
1017# CONFIG_IPW2200 is not set
1018# CONFIG_LIBERTAS is not set
1019# CONFIG_AIRO is not set
1020# CONFIG_HERMES is not set
1021# CONFIG_ATMEL is not set
1022# CONFIG_AIRO_CS is not set
1023# CONFIG_PCMCIA_WL3501 is not set
1024# CONFIG_PRISM54 is not set
1025# CONFIG_USB_ZD1201 is not set
1026# CONFIG_USB_NET_RNDIS_WLAN is not set
1027# CONFIG_RTL8180 is not set
1028# CONFIG_RTL8187 is not set
1029# CONFIG_ADM8211 is not set
1030# CONFIG_MAC80211_HWSIM is not set
1031# CONFIG_P54_COMMON is not set
1032CONFIG_ATH5K=y
1033# CONFIG_ATH5K_DEBUG is not set
1034# CONFIG_ATH9K is not set
1035# CONFIG_IWLCORE is not set
1036# CONFIG_IWLWIFI_LEDS is not set
1037# CONFIG_IWLAGN is not set
1038# CONFIG_IWL3945 is not set
1039# CONFIG_HOSTAP is not set
1040# CONFIG_B43 is not set
1041# CONFIG_B43LEGACY is not set
1042# CONFIG_ZD1211RW is not set
1043# CONFIG_RT2X00 is not set
771 1044
772# 1045#
773# USB Network Adapters 1046# USB Network Adapters
@@ -776,16 +1049,26 @@ CONFIG_S2IO=m
776# CONFIG_USB_KAWETH is not set 1049# CONFIG_USB_KAWETH is not set
777# CONFIG_USB_PEGASUS is not set 1050# CONFIG_USB_PEGASUS is not set
778# CONFIG_USB_RTL8150 is not set 1051# CONFIG_USB_RTL8150 is not set
779# CONFIG_USB_USBNET_MII is not set
780# CONFIG_USB_USBNET is not set 1052# CONFIG_USB_USBNET is not set
1053CONFIG_NET_PCMCIA=y
1054# CONFIG_PCMCIA_3C589 is not set
1055# CONFIG_PCMCIA_3C574 is not set
1056# CONFIG_PCMCIA_FMVJ18X is not set
1057# CONFIG_PCMCIA_PCNET is not set
1058# CONFIG_PCMCIA_NMCLAN is not set
1059# CONFIG_PCMCIA_SMC91C92 is not set
1060# CONFIG_PCMCIA_XIRC2PS is not set
1061# CONFIG_PCMCIA_AXNET is not set
781# CONFIG_WAN is not set 1062# CONFIG_WAN is not set
782# CONFIG_FDDI is not set 1063CONFIG_FDDI=y
1064# CONFIG_DEFXX is not set
1065# CONFIG_SKFP is not set
783# CONFIG_HIPPI is not set 1066# CONFIG_HIPPI is not set
784# CONFIG_PPP is not set 1067# CONFIG_PPP is not set
785# CONFIG_SLIP is not set 1068# CONFIG_SLIP is not set
786# CONFIG_NET_FC is not set 1069# CONFIG_NET_FC is not set
787# CONFIG_SHAPER is not set
788CONFIG_NETCONSOLE=y 1070CONFIG_NETCONSOLE=y
1071# CONFIG_NETCONSOLE_DYNAMIC is not set
789CONFIG_NETPOLL=y 1072CONFIG_NETPOLL=y
790# CONFIG_NETPOLL_TRAP is not set 1073# CONFIG_NETPOLL_TRAP is not set
791CONFIG_NET_POLL_CONTROLLER=y 1074CONFIG_NET_POLL_CONTROLLER=y
@@ -796,18 +1079,17 @@ CONFIG_NET_POLL_CONTROLLER=y
796# Input device support 1079# Input device support
797# 1080#
798CONFIG_INPUT=y 1081CONFIG_INPUT=y
799# CONFIG_INPUT_FF_MEMLESS is not set 1082CONFIG_INPUT_FF_MEMLESS=y
800# CONFIG_INPUT_POLLDEV is not set 1083CONFIG_INPUT_POLLDEV=y
801 1084
802# 1085#
803# Userland interfaces 1086# Userland interfaces
804# 1087#
805CONFIG_INPUT_MOUSEDEV=y 1088CONFIG_INPUT_MOUSEDEV=y
806CONFIG_INPUT_MOUSEDEV_PSAUX=y 1089# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
807CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 1090CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
808CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 1091CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
809# CONFIG_INPUT_JOYDEV is not set 1092# CONFIG_INPUT_JOYDEV is not set
810# CONFIG_INPUT_TSDEV is not set
811CONFIG_INPUT_EVDEV=y 1093CONFIG_INPUT_EVDEV=y
812# CONFIG_INPUT_EVBUG is not set 1094# CONFIG_INPUT_EVBUG is not set
813 1095
@@ -831,18 +1113,66 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
831# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1113# CONFIG_MOUSE_PS2_TOUCHKIT is not set
832# CONFIG_MOUSE_SERIAL is not set 1114# CONFIG_MOUSE_SERIAL is not set
833# CONFIG_MOUSE_APPLETOUCH is not set 1115# CONFIG_MOUSE_APPLETOUCH is not set
1116# CONFIG_MOUSE_BCM5974 is not set
834# CONFIG_MOUSE_VSXXXAA is not set 1117# CONFIG_MOUSE_VSXXXAA is not set
835# CONFIG_INPUT_JOYSTICK is not set 1118CONFIG_INPUT_JOYSTICK=y
836# CONFIG_INPUT_TABLET is not set 1119# CONFIG_JOYSTICK_ANALOG is not set
837# CONFIG_INPUT_TOUCHSCREEN is not set 1120# CONFIG_JOYSTICK_A3D is not set
838# CONFIG_INPUT_MISC is not set 1121# CONFIG_JOYSTICK_ADI is not set
1122# CONFIG_JOYSTICK_COBRA is not set
1123# CONFIG_JOYSTICK_GF2K is not set
1124# CONFIG_JOYSTICK_GRIP is not set
1125# CONFIG_JOYSTICK_GRIP_MP is not set
1126# CONFIG_JOYSTICK_GUILLEMOT is not set
1127# CONFIG_JOYSTICK_INTERACT is not set
1128# CONFIG_JOYSTICK_SIDEWINDER is not set
1129# CONFIG_JOYSTICK_TMDC is not set
1130# CONFIG_JOYSTICK_IFORCE is not set
1131# CONFIG_JOYSTICK_WARRIOR is not set
1132# CONFIG_JOYSTICK_MAGELLAN is not set
1133# CONFIG_JOYSTICK_SPACEORB is not set
1134# CONFIG_JOYSTICK_SPACEBALL is not set
1135# CONFIG_JOYSTICK_STINGER is not set
1136# CONFIG_JOYSTICK_TWIDJOY is not set
1137# CONFIG_JOYSTICK_ZHENHUA is not set
1138# CONFIG_JOYSTICK_JOYDUMP is not set
1139# CONFIG_JOYSTICK_XPAD is not set
1140CONFIG_INPUT_TABLET=y
1141# CONFIG_TABLET_USB_ACECAD is not set
1142# CONFIG_TABLET_USB_AIPTEK is not set
1143# CONFIG_TABLET_USB_GTCO is not set
1144# CONFIG_TABLET_USB_KBTAB is not set
1145# CONFIG_TABLET_USB_WACOM is not set
1146CONFIG_INPUT_TOUCHSCREEN=y
1147# CONFIG_TOUCHSCREEN_FUJITSU is not set
1148# CONFIG_TOUCHSCREEN_GUNZE is not set
1149# CONFIG_TOUCHSCREEN_ELO is not set
1150# CONFIG_TOUCHSCREEN_MTOUCH is not set
1151# CONFIG_TOUCHSCREEN_INEXIO is not set
1152# CONFIG_TOUCHSCREEN_MK712 is not set
1153# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1154# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1155# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1156# CONFIG_TOUCHSCREEN_UCB1400 is not set
1157# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1158# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1159CONFIG_INPUT_MISC=y
1160# CONFIG_INPUT_PCSPKR is not set
1161# CONFIG_INPUT_APANEL is not set
1162# CONFIG_INPUT_ATLAS_BTNS is not set
1163# CONFIG_INPUT_ATI_REMOTE is not set
1164# CONFIG_INPUT_ATI_REMOTE2 is not set
1165# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1166# CONFIG_INPUT_POWERMATE is not set
1167# CONFIG_INPUT_YEALINK is not set
1168# CONFIG_INPUT_UINPUT is not set
839 1169
840# 1170#
841# Hardware I/O ports 1171# Hardware I/O ports
842# 1172#
843CONFIG_SERIO=y 1173CONFIG_SERIO=y
844CONFIG_SERIO_I8042=y 1174CONFIG_SERIO_I8042=y
845# CONFIG_SERIO_SERPORT is not set 1175CONFIG_SERIO_SERPORT=y
846# CONFIG_SERIO_CT82C710 is not set 1176# CONFIG_SERIO_CT82C710 is not set
847# CONFIG_SERIO_PCIPS2 is not set 1177# CONFIG_SERIO_PCIPS2 is not set
848CONFIG_SERIO_LIBPS2=y 1178CONFIG_SERIO_LIBPS2=y
@@ -853,10 +1183,29 @@ CONFIG_SERIO_LIBPS2=y
853# Character devices 1183# Character devices
854# 1184#
855CONFIG_VT=y 1185CONFIG_VT=y
1186CONFIG_CONSOLE_TRANSLATIONS=y
856CONFIG_VT_CONSOLE=y 1187CONFIG_VT_CONSOLE=y
857CONFIG_HW_CONSOLE=y 1188CONFIG_HW_CONSOLE=y
858# CONFIG_VT_HW_CONSOLE_BINDING is not set 1189CONFIG_VT_HW_CONSOLE_BINDING=y
859# CONFIG_SERIAL_NONSTANDARD is not set 1190CONFIG_DEVKMEM=y
1191CONFIG_SERIAL_NONSTANDARD=y
1192# CONFIG_COMPUTONE is not set
1193# CONFIG_ROCKETPORT is not set
1194# CONFIG_CYCLADES is not set
1195# CONFIG_DIGIEPCA is not set
1196# CONFIG_MOXA_INTELLIO is not set
1197# CONFIG_MOXA_SMARTIO is not set
1198# CONFIG_ISI is not set
1199# CONFIG_SYNCLINK is not set
1200# CONFIG_SYNCLINKMP is not set
1201# CONFIG_SYNCLINK_GT is not set
1202# CONFIG_N_HDLC is not set
1203# CONFIG_RISCOM8 is not set
1204# CONFIG_SPECIALIX is not set
1205# CONFIG_SX is not set
1206# CONFIG_RIO is not set
1207# CONFIG_STALDRV is not set
1208# CONFIG_NOZOMI is not set
860 1209
861# 1210#
862# Serial drivers 1211# Serial drivers
@@ -866,9 +1215,14 @@ CONFIG_SERIAL_8250_CONSOLE=y
866CONFIG_FIX_EARLYCON_MEM=y 1215CONFIG_FIX_EARLYCON_MEM=y
867CONFIG_SERIAL_8250_PCI=y 1216CONFIG_SERIAL_8250_PCI=y
868CONFIG_SERIAL_8250_PNP=y 1217CONFIG_SERIAL_8250_PNP=y
869CONFIG_SERIAL_8250_NR_UARTS=4 1218# CONFIG_SERIAL_8250_CS is not set
1219CONFIG_SERIAL_8250_NR_UARTS=32
870CONFIG_SERIAL_8250_RUNTIME_UARTS=4 1220CONFIG_SERIAL_8250_RUNTIME_UARTS=4
871# CONFIG_SERIAL_8250_EXTENDED is not set 1221CONFIG_SERIAL_8250_EXTENDED=y
1222CONFIG_SERIAL_8250_MANY_PORTS=y
1223CONFIG_SERIAL_8250_SHARE_IRQ=y
1224CONFIG_SERIAL_8250_DETECT_IRQ=y
1225CONFIG_SERIAL_8250_RSA=y
872 1226
873# 1227#
874# Non-8250 serial port support 1228# Non-8250 serial port support
@@ -877,114 +1231,417 @@ CONFIG_SERIAL_CORE=y
877CONFIG_SERIAL_CORE_CONSOLE=y 1231CONFIG_SERIAL_CORE_CONSOLE=y
878# CONFIG_SERIAL_JSM is not set 1232# CONFIG_SERIAL_JSM is not set
879CONFIG_UNIX98_PTYS=y 1233CONFIG_UNIX98_PTYS=y
880CONFIG_LEGACY_PTYS=y 1234# CONFIG_LEGACY_PTYS is not set
881CONFIG_LEGACY_PTY_COUNT=256
882# CONFIG_IPMI_HANDLER is not set 1235# CONFIG_IPMI_HANDLER is not set
883# CONFIG_WATCHDOG is not set
884CONFIG_HW_RANDOM=y 1236CONFIG_HW_RANDOM=y
885CONFIG_HW_RANDOM_INTEL=y 1237# CONFIG_HW_RANDOM_INTEL is not set
886CONFIG_HW_RANDOM_AMD=y 1238# CONFIG_HW_RANDOM_AMD is not set
887# CONFIG_NVRAM is not set 1239CONFIG_NVRAM=y
888CONFIG_RTC=y
889# CONFIG_R3964 is not set 1240# CONFIG_R3964 is not set
890# CONFIG_APPLICOM is not set 1241# CONFIG_APPLICOM is not set
891CONFIG_AGP=y 1242
892CONFIG_AGP_AMD64=y 1243#
893CONFIG_AGP_INTEL=y 1244# PCMCIA character devices
894# CONFIG_AGP_SIS is not set 1245#
895# CONFIG_AGP_VIA is not set 1246# CONFIG_SYNCLINK_CS is not set
896# CONFIG_DRM is not set 1247# CONFIG_CARDMAN_4000 is not set
1248# CONFIG_CARDMAN_4040 is not set
1249# CONFIG_IPWIRELESS is not set
897# CONFIG_MWAVE is not set 1250# CONFIG_MWAVE is not set
898# CONFIG_PC8736x_GPIO is not set 1251# CONFIG_PC8736x_GPIO is not set
899CONFIG_RAW_DRIVER=y 1252# CONFIG_RAW_DRIVER is not set
900CONFIG_MAX_RAW_DEVS=256
901CONFIG_HPET=y 1253CONFIG_HPET=y
902# CONFIG_HPET_RTC_IRQ is not set 1254# CONFIG_HPET_MMAP is not set
903CONFIG_HPET_MMAP=y
904# CONFIG_HANGCHECK_TIMER is not set 1255# CONFIG_HANGCHECK_TIMER is not set
905# CONFIG_TCG_TPM is not set 1256# CONFIG_TCG_TPM is not set
906# CONFIG_TELCLOCK is not set 1257# CONFIG_TELCLOCK is not set
907CONFIG_DEVPORT=y 1258CONFIG_DEVPORT=y
908# CONFIG_I2C is not set 1259CONFIG_I2C=y
1260CONFIG_I2C_BOARDINFO=y
1261# CONFIG_I2C_CHARDEV is not set
1262CONFIG_I2C_HELPER_AUTO=y
1263
1264#
1265# I2C Hardware Bus support
1266#
1267
1268#
1269# PC SMBus host controller drivers
1270#
1271# CONFIG_I2C_ALI1535 is not set
1272# CONFIG_I2C_ALI1563 is not set
1273# CONFIG_I2C_ALI15X3 is not set
1274# CONFIG_I2C_AMD756 is not set
1275# CONFIG_I2C_AMD8111 is not set
1276CONFIG_I2C_I801=y
1277# CONFIG_I2C_ISCH is not set
1278# CONFIG_I2C_PIIX4 is not set
1279# CONFIG_I2C_NFORCE2 is not set
1280# CONFIG_I2C_SIS5595 is not set
1281# CONFIG_I2C_SIS630 is not set
1282# CONFIG_I2C_SIS96X is not set
1283# CONFIG_I2C_VIA is not set
1284# CONFIG_I2C_VIAPRO is not set
1285
1286#
1287# I2C system bus drivers (mostly embedded / system-on-chip)
1288#
1289# CONFIG_I2C_OCORES is not set
1290# CONFIG_I2C_SIMTEC is not set
1291
1292#
1293# External I2C/SMBus adapter drivers
1294#
1295# CONFIG_I2C_PARPORT_LIGHT is not set
1296# CONFIG_I2C_TAOS_EVM is not set
1297# CONFIG_I2C_TINY_USB is not set
1298
1299#
1300# Graphics adapter I2C/DDC channel drivers
1301#
1302# CONFIG_I2C_VOODOO3 is not set
909 1303
910# 1304#
911# SPI support 1305# Other I2C/SMBus bus drivers
912# 1306#
1307# CONFIG_I2C_PCA_PLATFORM is not set
1308# CONFIG_I2C_STUB is not set
1309
1310#
1311# Miscellaneous I2C Chip support
1312#
1313# CONFIG_DS1682 is not set
1314# CONFIG_AT24 is not set
1315# CONFIG_SENSORS_EEPROM is not set
1316# CONFIG_SENSORS_PCF8574 is not set
1317# CONFIG_PCF8575 is not set
1318# CONFIG_SENSORS_PCA9539 is not set
1319# CONFIG_SENSORS_PCF8591 is not set
1320# CONFIG_SENSORS_MAX6875 is not set
1321# CONFIG_SENSORS_TSL2550 is not set
1322# CONFIG_I2C_DEBUG_CORE is not set
1323# CONFIG_I2C_DEBUG_ALGO is not set
1324# CONFIG_I2C_DEBUG_BUS is not set
1325# CONFIG_I2C_DEBUG_CHIP is not set
913# CONFIG_SPI is not set 1326# CONFIG_SPI is not set
914# CONFIG_SPI_MASTER is not set 1327CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1328# CONFIG_GPIOLIB is not set
915# CONFIG_W1 is not set 1329# CONFIG_W1 is not set
916# CONFIG_POWER_SUPPLY is not set 1330CONFIG_POWER_SUPPLY=y
1331# CONFIG_POWER_SUPPLY_DEBUG is not set
1332# CONFIG_PDA_POWER is not set
1333# CONFIG_BATTERY_DS2760 is not set
917# CONFIG_HWMON is not set 1334# CONFIG_HWMON is not set
1335CONFIG_THERMAL=y
1336CONFIG_WATCHDOG=y
1337# CONFIG_WATCHDOG_NOWAYOUT is not set
1338
1339#
1340# Watchdog Device Drivers
1341#
1342# CONFIG_SOFT_WATCHDOG is not set
1343# CONFIG_ACQUIRE_WDT is not set
1344# CONFIG_ADVANTECH_WDT is not set
1345# CONFIG_ALIM1535_WDT is not set
1346# CONFIG_ALIM7101_WDT is not set
1347# CONFIG_SC520_WDT is not set
1348# CONFIG_EUROTECH_WDT is not set
1349# CONFIG_IB700_WDT is not set
1350# CONFIG_IBMASR is not set
1351# CONFIG_WAFER_WDT is not set
1352# CONFIG_I6300ESB_WDT is not set
1353# CONFIG_ITCO_WDT is not set
1354# CONFIG_IT8712F_WDT is not set
1355# CONFIG_HP_WATCHDOG is not set
1356# CONFIG_SC1200_WDT is not set
1357# CONFIG_PC87413_WDT is not set
1358# CONFIG_60XX_WDT is not set
1359# CONFIG_SBC8360_WDT is not set
1360# CONFIG_CPU5_WDT is not set
1361# CONFIG_SMSC37B787_WDT is not set
1362# CONFIG_W83627HF_WDT is not set
1363# CONFIG_W83697HF_WDT is not set
1364# CONFIG_W83877F_WDT is not set
1365# CONFIG_W83977F_WDT is not set
1366# CONFIG_MACHZ_WDT is not set
1367# CONFIG_SBC_EPX_C3_WATCHDOG is not set
1368
1369#
1370# PCI-based Watchdog Cards
1371#
1372# CONFIG_PCIPCWATCHDOG is not set
1373# CONFIG_WDTPCI is not set
1374
1375#
1376# USB-based Watchdog Cards
1377#
1378# CONFIG_USBPCWATCHDOG is not set
1379
1380#
1381# Sonics Silicon Backplane
1382#
1383CONFIG_SSB_POSSIBLE=y
1384# CONFIG_SSB is not set
918 1385
919# 1386#
920# Multifunction device drivers 1387# Multifunction device drivers
921# 1388#
1389# CONFIG_MFD_CORE is not set
922# CONFIG_MFD_SM501 is not set 1390# CONFIG_MFD_SM501 is not set
1391# CONFIG_HTC_PASIC3 is not set
1392# CONFIG_MFD_TMIO is not set
923 1393
924# 1394#
925# Multimedia devices 1395# Multimedia devices
926# 1396#
1397
1398#
1399# Multimedia core support
1400#
927# CONFIG_VIDEO_DEV is not set 1401# CONFIG_VIDEO_DEV is not set
928# CONFIG_DVB_CORE is not set 1402# CONFIG_DVB_CORE is not set
1403# CONFIG_VIDEO_MEDIA is not set
1404
1405#
1406# Multimedia drivers
1407#
929CONFIG_DAB=y 1408CONFIG_DAB=y
930# CONFIG_USB_DABUSB is not set 1409# CONFIG_USB_DABUSB is not set
931 1410
932# 1411#
933# Graphics support 1412# Graphics support
934# 1413#
935# CONFIG_BACKLIGHT_LCD_SUPPORT is not set 1414CONFIG_AGP=y
1415CONFIG_AGP_AMD64=y
1416CONFIG_AGP_INTEL=y
1417# CONFIG_AGP_SIS is not set
1418# CONFIG_AGP_VIA is not set
1419CONFIG_DRM=y
1420# CONFIG_DRM_TDFX is not set
1421# CONFIG_DRM_R128 is not set
1422# CONFIG_DRM_RADEON is not set
1423# CONFIG_DRM_I810 is not set
1424# CONFIG_DRM_I830 is not set
1425CONFIG_DRM_I915=y
1426# CONFIG_DRM_MGA is not set
1427# CONFIG_DRM_SIS is not set
1428# CONFIG_DRM_VIA is not set
1429# CONFIG_DRM_SAVAGE is not set
1430# CONFIG_VGASTATE is not set
1431# CONFIG_VIDEO_OUTPUT_CONTROL is not set
1432CONFIG_FB=y
1433# CONFIG_FIRMWARE_EDID is not set
1434# CONFIG_FB_DDC is not set
1435CONFIG_FB_CFB_FILLRECT=y
1436CONFIG_FB_CFB_COPYAREA=y
1437CONFIG_FB_CFB_IMAGEBLIT=y
1438# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
1439# CONFIG_FB_SYS_FILLRECT is not set
1440# CONFIG_FB_SYS_COPYAREA is not set
1441# CONFIG_FB_SYS_IMAGEBLIT is not set
1442# CONFIG_FB_FOREIGN_ENDIAN is not set
1443# CONFIG_FB_SYS_FOPS is not set
1444# CONFIG_FB_SVGALIB is not set
1445# CONFIG_FB_MACMODES is not set
1446# CONFIG_FB_BACKLIGHT is not set
1447CONFIG_FB_MODE_HELPERS=y
1448CONFIG_FB_TILEBLITTING=y
1449
1450#
1451# Frame buffer hardware drivers
1452#
1453# CONFIG_FB_CIRRUS is not set
1454# CONFIG_FB_PM2 is not set
1455# CONFIG_FB_CYBER2000 is not set
1456# CONFIG_FB_ARC is not set
1457# CONFIG_FB_ASILIANT is not set
1458# CONFIG_FB_IMSTT is not set
1459# CONFIG_FB_VGA16 is not set
1460# CONFIG_FB_UVESA is not set
1461# CONFIG_FB_VESA is not set
1462CONFIG_FB_EFI=y
1463# CONFIG_FB_IMAC is not set
1464# CONFIG_FB_N411 is not set
1465# CONFIG_FB_HGA is not set
1466# CONFIG_FB_S1D13XXX is not set
1467# CONFIG_FB_NVIDIA is not set
1468# CONFIG_FB_RIVA is not set
1469# CONFIG_FB_LE80578 is not set
1470# CONFIG_FB_INTEL is not set
1471# CONFIG_FB_MATROX is not set
1472# CONFIG_FB_RADEON is not set
1473# CONFIG_FB_ATY128 is not set
1474# CONFIG_FB_ATY is not set
1475# CONFIG_FB_S3 is not set
1476# CONFIG_FB_SAVAGE is not set
1477# CONFIG_FB_SIS is not set
1478# CONFIG_FB_NEOMAGIC is not set
1479# CONFIG_FB_KYRO is not set
1480# CONFIG_FB_3DFX is not set
1481# CONFIG_FB_VOODOO1 is not set
1482# CONFIG_FB_VT8623 is not set
1483# CONFIG_FB_TRIDENT is not set
1484# CONFIG_FB_ARK is not set
1485# CONFIG_FB_PM3 is not set
1486# CONFIG_FB_CARMINE is not set
1487# CONFIG_FB_GEODE is not set
1488# CONFIG_FB_VIRTUAL is not set
1489CONFIG_BACKLIGHT_LCD_SUPPORT=y
1490# CONFIG_LCD_CLASS_DEVICE is not set
1491CONFIG_BACKLIGHT_CLASS_DEVICE=y
1492# CONFIG_BACKLIGHT_CORGI is not set
1493# CONFIG_BACKLIGHT_PROGEAR is not set
1494# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
936 1495
937# 1496#
938# Display device support 1497# Display device support
939# 1498#
940# CONFIG_DISPLAY_SUPPORT is not set 1499# CONFIG_DISPLAY_SUPPORT is not set
941# CONFIG_VGASTATE is not set
942# CONFIG_FB is not set
943 1500
944# 1501#
945# Console display driver support 1502# Console display driver support
946# 1503#
947CONFIG_VGA_CONSOLE=y 1504CONFIG_VGA_CONSOLE=y
948CONFIG_VGACON_SOFT_SCROLLBACK=y 1505CONFIG_VGACON_SOFT_SCROLLBACK=y
949CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 1506CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
950CONFIG_VIDEO_SELECT=y
951CONFIG_DUMMY_CONSOLE=y 1507CONFIG_DUMMY_CONSOLE=y
952 1508# CONFIG_FRAMEBUFFER_CONSOLE is not set
953# 1509CONFIG_LOGO=y
954# Sound 1510# CONFIG_LOGO_LINUX_MONO is not set
955# 1511# CONFIG_LOGO_LINUX_VGA16 is not set
1512CONFIG_LOGO_LINUX_CLUT224=y
956CONFIG_SOUND=y 1513CONFIG_SOUND=y
957 1514CONFIG_SND=y
958# 1515CONFIG_SND_TIMER=y
959# Advanced Linux Sound Architecture 1516CONFIG_SND_PCM=y
960# 1517CONFIG_SND_HWDEP=y
961# CONFIG_SND is not set 1518CONFIG_SND_SEQUENCER=y
962 1519CONFIG_SND_SEQ_DUMMY=y
963# 1520CONFIG_SND_OSSEMUL=y
964# Open Sound System 1521CONFIG_SND_MIXER_OSS=y
965# 1522CONFIG_SND_PCM_OSS=y
966CONFIG_SOUND_PRIME=y 1523CONFIG_SND_PCM_OSS_PLUGINS=y
967# CONFIG_SOUND_TRIDENT is not set 1524CONFIG_SND_SEQUENCER_OSS=y
968# CONFIG_SOUND_MSNDCLAS is not set 1525CONFIG_SND_DYNAMIC_MINORS=y
969# CONFIG_SOUND_MSNDPIN is not set 1526CONFIG_SND_SUPPORT_OLD_API=y
970# CONFIG_SOUND_OSS is not set 1527CONFIG_SND_VERBOSE_PROCFS=y
1528# CONFIG_SND_VERBOSE_PRINTK is not set
1529# CONFIG_SND_DEBUG is not set
1530CONFIG_SND_VMASTER=y
1531CONFIG_SND_DRIVERS=y
1532# CONFIG_SND_PCSP is not set
1533# CONFIG_SND_DUMMY is not set
1534# CONFIG_SND_VIRMIDI is not set
1535# CONFIG_SND_MTPAV is not set
1536# CONFIG_SND_SERIAL_U16550 is not set
1537# CONFIG_SND_MPU401 is not set
1538CONFIG_SND_PCI=y
1539# CONFIG_SND_AD1889 is not set
1540# CONFIG_SND_ALS300 is not set
1541# CONFIG_SND_ALS4000 is not set
1542# CONFIG_SND_ALI5451 is not set
1543# CONFIG_SND_ATIIXP is not set
1544# CONFIG_SND_ATIIXP_MODEM is not set
1545# CONFIG_SND_AU8810 is not set
1546# CONFIG_SND_AU8820 is not set
1547# CONFIG_SND_AU8830 is not set
1548# CONFIG_SND_AW2 is not set
1549# CONFIG_SND_AZT3328 is not set
1550# CONFIG_SND_BT87X is not set
1551# CONFIG_SND_CA0106 is not set
1552# CONFIG_SND_CMIPCI is not set
1553# CONFIG_SND_OXYGEN is not set
1554# CONFIG_SND_CS4281 is not set
1555# CONFIG_SND_CS46XX is not set
1556# CONFIG_SND_CS5530 is not set
1557# CONFIG_SND_DARLA20 is not set
1558# CONFIG_SND_GINA20 is not set
1559# CONFIG_SND_LAYLA20 is not set
1560# CONFIG_SND_DARLA24 is not set
1561# CONFIG_SND_GINA24 is not set
1562# CONFIG_SND_LAYLA24 is not set
1563# CONFIG_SND_MONA is not set
1564# CONFIG_SND_MIA is not set
1565# CONFIG_SND_ECHO3G is not set
1566# CONFIG_SND_INDIGO is not set
1567# CONFIG_SND_INDIGOIO is not set
1568# CONFIG_SND_INDIGODJ is not set
1569# CONFIG_SND_EMU10K1 is not set
1570# CONFIG_SND_EMU10K1X is not set
1571# CONFIG_SND_ENS1370 is not set
1572# CONFIG_SND_ENS1371 is not set
1573# CONFIG_SND_ES1938 is not set
1574# CONFIG_SND_ES1968 is not set
1575# CONFIG_SND_FM801 is not set
1576CONFIG_SND_HDA_INTEL=y
1577CONFIG_SND_HDA_HWDEP=y
1578CONFIG_SND_HDA_CODEC_REALTEK=y
1579CONFIG_SND_HDA_CODEC_ANALOG=y
1580CONFIG_SND_HDA_CODEC_SIGMATEL=y
1581CONFIG_SND_HDA_CODEC_VIA=y
1582CONFIG_SND_HDA_CODEC_ATIHDMI=y
1583CONFIG_SND_HDA_CODEC_CONEXANT=y
1584CONFIG_SND_HDA_CODEC_CMEDIA=y
1585CONFIG_SND_HDA_CODEC_SI3054=y
1586CONFIG_SND_HDA_GENERIC=y
1587# CONFIG_SND_HDA_POWER_SAVE is not set
1588# CONFIG_SND_HDSP is not set
1589# CONFIG_SND_HDSPM is not set
1590# CONFIG_SND_HIFIER is not set
1591# CONFIG_SND_ICE1712 is not set
1592# CONFIG_SND_ICE1724 is not set
1593# CONFIG_SND_INTEL8X0 is not set
1594# CONFIG_SND_INTEL8X0M is not set
1595# CONFIG_SND_KORG1212 is not set
1596# CONFIG_SND_MAESTRO3 is not set
1597# CONFIG_SND_MIXART is not set
1598# CONFIG_SND_NM256 is not set
1599# CONFIG_SND_PCXHR is not set
1600# CONFIG_SND_RIPTIDE is not set
1601# CONFIG_SND_RME32 is not set
1602# CONFIG_SND_RME96 is not set
1603# CONFIG_SND_RME9652 is not set
1604# CONFIG_SND_SONICVIBES is not set
1605# CONFIG_SND_TRIDENT is not set
1606# CONFIG_SND_VIA82XX is not set
1607# CONFIG_SND_VIA82XX_MODEM is not set
1608# CONFIG_SND_VIRTUOSO is not set
1609# CONFIG_SND_VX222 is not set
1610# CONFIG_SND_YMFPCI is not set
1611CONFIG_SND_USB=y
1612# CONFIG_SND_USB_AUDIO is not set
1613# CONFIG_SND_USB_USX2Y is not set
1614# CONFIG_SND_USB_CAIAQ is not set
1615CONFIG_SND_PCMCIA=y
1616# CONFIG_SND_VXPOCKET is not set
1617# CONFIG_SND_PDAUDIOCF is not set
1618# CONFIG_SND_SOC is not set
1619# CONFIG_SOUND_PRIME is not set
971CONFIG_HID_SUPPORT=y 1620CONFIG_HID_SUPPORT=y
972CONFIG_HID=y 1621CONFIG_HID=y
973# CONFIG_HID_DEBUG is not set 1622CONFIG_HID_DEBUG=y
1623CONFIG_HIDRAW=y
974 1624
975# 1625#
976# USB Input Devices 1626# USB Input Devices
977# 1627#
978CONFIG_USB_HID=y 1628CONFIG_USB_HID=y
979# CONFIG_USB_HIDINPUT_POWERBOOK is not set 1629CONFIG_USB_HIDINPUT_POWERBOOK=y
980# CONFIG_HID_FF is not set 1630CONFIG_HID_FF=y
981# CONFIG_USB_HIDDEV is not set 1631CONFIG_HID_PID=y
1632CONFIG_LOGITECH_FF=y
1633# CONFIG_LOGIRUMBLEPAD2_FF is not set
1634CONFIG_PANTHERLORD_FF=y
1635CONFIG_THRUSTMASTER_FF=y
1636CONFIG_ZEROPLUS_FF=y
1637CONFIG_USB_HIDDEV=y
982CONFIG_USB_SUPPORT=y 1638CONFIG_USB_SUPPORT=y
983CONFIG_USB_ARCH_HAS_HCD=y 1639CONFIG_USB_ARCH_HAS_HCD=y
984CONFIG_USB_ARCH_HAS_OHCI=y 1640CONFIG_USB_ARCH_HAS_OHCI=y
985CONFIG_USB_ARCH_HAS_EHCI=y 1641CONFIG_USB_ARCH_HAS_EHCI=y
986CONFIG_USB=y 1642CONFIG_USB=y
987# CONFIG_USB_DEBUG is not set 1643CONFIG_USB_DEBUG=y
1644CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
988 1645
989# 1646#
990# Miscellaneous USB options 1647# Miscellaneous USB options
@@ -992,18 +1649,19 @@ CONFIG_USB=y
992CONFIG_USB_DEVICEFS=y 1649CONFIG_USB_DEVICEFS=y
993# CONFIG_USB_DEVICE_CLASS is not set 1650# CONFIG_USB_DEVICE_CLASS is not set
994# CONFIG_USB_DYNAMIC_MINORS is not set 1651# CONFIG_USB_DYNAMIC_MINORS is not set
995# CONFIG_USB_SUSPEND is not set 1652CONFIG_USB_SUSPEND=y
996# CONFIG_USB_PERSIST is not set
997# CONFIG_USB_OTG is not set 1653# CONFIG_USB_OTG is not set
1654CONFIG_USB_MON=y
998 1655
999# 1656#
1000# USB Host Controller Drivers 1657# USB Host Controller Drivers
1001# 1658#
1659# CONFIG_USB_C67X00_HCD is not set
1002CONFIG_USB_EHCI_HCD=y 1660CONFIG_USB_EHCI_HCD=y
1003# CONFIG_USB_EHCI_SPLIT_ISO is not set
1004# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1661# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1005# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1662# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1006# CONFIG_USB_ISP116X_HCD is not set 1663# CONFIG_USB_ISP116X_HCD is not set
1664# CONFIG_USB_ISP1760_HCD is not set
1007CONFIG_USB_OHCI_HCD=y 1665CONFIG_USB_OHCI_HCD=y
1008# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set 1666# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
1009# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set 1667# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
@@ -1017,6 +1675,7 @@ CONFIG_USB_UHCI_HCD=y
1017# 1675#
1018# CONFIG_USB_ACM is not set 1676# CONFIG_USB_ACM is not set
1019CONFIG_USB_PRINTER=y 1677CONFIG_USB_PRINTER=y
1678# CONFIG_USB_WDM is not set
1020 1679
1021# 1680#
1022# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1681# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1036,23 +1695,21 @@ CONFIG_USB_STORAGE=y
1036# CONFIG_USB_STORAGE_SDDR55 is not set 1695# CONFIG_USB_STORAGE_SDDR55 is not set
1037# CONFIG_USB_STORAGE_JUMPSHOT is not set 1696# CONFIG_USB_STORAGE_JUMPSHOT is not set
1038# CONFIG_USB_STORAGE_ALAUDA is not set 1697# CONFIG_USB_STORAGE_ALAUDA is not set
1698# CONFIG_USB_STORAGE_ONETOUCH is not set
1039# CONFIG_USB_STORAGE_KARMA is not set 1699# CONFIG_USB_STORAGE_KARMA is not set
1040# CONFIG_USB_LIBUSUAL is not set 1700# CONFIG_USB_STORAGE_SIERRA is not set
1701# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1702CONFIG_USB_LIBUSUAL=y
1041 1703
1042# 1704#
1043# USB Imaging devices 1705# USB Imaging devices
1044# 1706#
1045# CONFIG_USB_MDC800 is not set 1707# CONFIG_USB_MDC800 is not set
1046# CONFIG_USB_MICROTEK is not set 1708# CONFIG_USB_MICROTEK is not set
1047CONFIG_USB_MON=y
1048 1709
1049# 1710#
1050# USB port drivers 1711# USB port drivers
1051# 1712#
1052
1053#
1054# USB Serial Converter support
1055#
1056# CONFIG_USB_SERIAL is not set 1713# CONFIG_USB_SERIAL is not set
1057 1714
1058# 1715#
@@ -1061,7 +1718,6 @@ CONFIG_USB_MON=y
1061# CONFIG_USB_EMI62 is not set 1718# CONFIG_USB_EMI62 is not set
1062# CONFIG_USB_EMI26 is not set 1719# CONFIG_USB_EMI26 is not set
1063# CONFIG_USB_ADUTUX is not set 1720# CONFIG_USB_ADUTUX is not set
1064# CONFIG_USB_AUERSWALD is not set
1065# CONFIG_USB_RIO500 is not set 1721# CONFIG_USB_RIO500 is not set
1066# CONFIG_USB_LEGOTOWER is not set 1722# CONFIG_USB_LEGOTOWER is not set
1067# CONFIG_USB_LCD is not set 1723# CONFIG_USB_LCD is not set
@@ -1078,98 +1734,132 @@ CONFIG_USB_MON=y
1078# CONFIG_USB_TRANCEVIBRATOR is not set 1734# CONFIG_USB_TRANCEVIBRATOR is not set
1079# CONFIG_USB_IOWARRIOR is not set 1735# CONFIG_USB_IOWARRIOR is not set
1080# CONFIG_USB_TEST is not set 1736# CONFIG_USB_TEST is not set
1737# CONFIG_USB_ISIGHTFW is not set
1738# CONFIG_USB_GADGET is not set
1739# CONFIG_MMC is not set
1740# CONFIG_MEMSTICK is not set
1741CONFIG_NEW_LEDS=y
1742CONFIG_LEDS_CLASS=y
1081 1743
1082# 1744#
1083# USB DSL modem support 1745# LED drivers
1084# 1746#
1747# CONFIG_LEDS_PCA9532 is not set
1748# CONFIG_LEDS_CLEVO_MAIL is not set
1749# CONFIG_LEDS_PCA955X is not set
1085 1750
1086# 1751#
1087# USB Gadget Support 1752# LED Triggers
1088# 1753#
1089# CONFIG_USB_GADGET is not set 1754CONFIG_LEDS_TRIGGERS=y
1090# CONFIG_MMC is not set 1755# CONFIG_LEDS_TRIGGER_TIMER is not set
1756# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1757# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1758# CONFIG_ACCESSIBILITY is not set
1759# CONFIG_INFINIBAND is not set
1760CONFIG_EDAC=y
1091 1761
1092# 1762#
1093# LED devices 1763# Reporting subsystems
1094# 1764#
1095# CONFIG_NEW_LEDS is not set 1765# CONFIG_EDAC_DEBUG is not set
1766# CONFIG_EDAC_MM_EDAC is not set
1767CONFIG_RTC_LIB=y
1768CONFIG_RTC_CLASS=y
1769# CONFIG_RTC_HCTOSYS is not set
1770# CONFIG_RTC_DEBUG is not set
1096 1771
1097# 1772#
1098# LED drivers 1773# RTC interfaces
1099# 1774#
1775CONFIG_RTC_INTF_SYSFS=y
1776CONFIG_RTC_INTF_PROC=y
1777CONFIG_RTC_INTF_DEV=y
1778# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
1779# CONFIG_RTC_DRV_TEST is not set
1100 1780
1101# 1781#
1102# LED Triggers 1782# I2C RTC drivers
1103# 1783#
1104# CONFIG_INFINIBAND is not set 1784# CONFIG_RTC_DRV_DS1307 is not set
1105# CONFIG_EDAC is not set 1785# CONFIG_RTC_DRV_DS1374 is not set
1786# CONFIG_RTC_DRV_DS1672 is not set
1787# CONFIG_RTC_DRV_MAX6900 is not set
1788# CONFIG_RTC_DRV_RS5C372 is not set
1789# CONFIG_RTC_DRV_ISL1208 is not set
1790# CONFIG_RTC_DRV_X1205 is not set
1791# CONFIG_RTC_DRV_PCF8563 is not set
1792# CONFIG_RTC_DRV_PCF8583 is not set
1793# CONFIG_RTC_DRV_M41T80 is not set
1794# CONFIG_RTC_DRV_S35390A is not set
1795# CONFIG_RTC_DRV_FM3130 is not set
1106 1796
1107# 1797#
1108# Real Time Clock 1798# SPI RTC drivers
1109# 1799#
1110# CONFIG_RTC_CLASS is not set
1111 1800
1112# 1801#
1113# DMA Engine support 1802# Platform RTC drivers
1114# 1803#
1115# CONFIG_DMA_ENGINE is not set 1804CONFIG_RTC_DRV_CMOS=y
1805# CONFIG_RTC_DRV_DS1511 is not set
1806# CONFIG_RTC_DRV_DS1553 is not set
1807# CONFIG_RTC_DRV_DS1742 is not set
1808# CONFIG_RTC_DRV_STK17TA8 is not set
1809# CONFIG_RTC_DRV_M48T86 is not set
1810# CONFIG_RTC_DRV_M48T59 is not set
1811# CONFIG_RTC_DRV_V3020 is not set
1116 1812
1117# 1813#
1118# DMA Clients 1814# on-CPU RTC drivers
1119# 1815#
1816CONFIG_DMADEVICES=y
1120 1817
1121# 1818#
1122# DMA Devices 1819# DMA Devices
1123# 1820#
1124CONFIG_VIRTUALIZATION=y 1821# CONFIG_INTEL_IOATDMA is not set
1125# CONFIG_KVM is not set
1126
1127#
1128# Userspace I/O
1129#
1130# CONFIG_UIO is not set 1822# CONFIG_UIO is not set
1131 1823
1132# 1824#
1133# Firmware Drivers 1825# Firmware Drivers
1134# 1826#
1135# CONFIG_EDD is not set 1827# CONFIG_EDD is not set
1828CONFIG_FIRMWARE_MEMMAP=y
1829CONFIG_EFI_VARS=y
1136# CONFIG_DELL_RBU is not set 1830# CONFIG_DELL_RBU is not set
1137# CONFIG_DCDBAS is not set 1831# CONFIG_DCDBAS is not set
1138CONFIG_DMIID=y 1832CONFIG_DMIID=y
1833CONFIG_ISCSI_IBFT_FIND=y
1834CONFIG_ISCSI_IBFT=y
1139 1835
1140# 1836#
1141# File systems 1837# File systems
1142# 1838#
1143CONFIG_EXT2_FS=y 1839# CONFIG_EXT2_FS is not set
1144CONFIG_EXT2_FS_XATTR=y
1145CONFIG_EXT2_FS_POSIX_ACL=y
1146# CONFIG_EXT2_FS_SECURITY is not set
1147# CONFIG_EXT2_FS_XIP is not set
1148CONFIG_EXT3_FS=y 1840CONFIG_EXT3_FS=y
1149CONFIG_EXT3_FS_XATTR=y 1841CONFIG_EXT3_FS_XATTR=y
1150CONFIG_EXT3_FS_POSIX_ACL=y 1842CONFIG_EXT3_FS_POSIX_ACL=y
1151# CONFIG_EXT3_FS_SECURITY is not set 1843CONFIG_EXT3_FS_SECURITY=y
1152# CONFIG_EXT4DEV_FS is not set 1844# CONFIG_EXT4DEV_FS is not set
1153CONFIG_JBD=y 1845CONFIG_JBD=y
1154# CONFIG_JBD_DEBUG is not set 1846# CONFIG_JBD_DEBUG is not set
1155CONFIG_FS_MBCACHE=y 1847CONFIG_FS_MBCACHE=y
1156CONFIG_REISERFS_FS=y 1848# CONFIG_REISERFS_FS is not set
1157# CONFIG_REISERFS_CHECK is not set
1158# CONFIG_REISERFS_PROC_INFO is not set
1159CONFIG_REISERFS_FS_XATTR=y
1160CONFIG_REISERFS_FS_POSIX_ACL=y
1161# CONFIG_REISERFS_FS_SECURITY is not set
1162# CONFIG_JFS_FS is not set 1849# CONFIG_JFS_FS is not set
1163CONFIG_FS_POSIX_ACL=y 1850CONFIG_FS_POSIX_ACL=y
1164# CONFIG_XFS_FS is not set 1851# CONFIG_XFS_FS is not set
1165# CONFIG_GFS2_FS is not set 1852# CONFIG_GFS2_FS is not set
1166# CONFIG_OCFS2_FS is not set 1853# CONFIG_OCFS2_FS is not set
1167# CONFIG_MINIX_FS is not set 1854CONFIG_DNOTIFY=y
1168# CONFIG_ROMFS_FS is not set
1169CONFIG_INOTIFY=y 1855CONFIG_INOTIFY=y
1170CONFIG_INOTIFY_USER=y 1856CONFIG_INOTIFY_USER=y
1171# CONFIG_QUOTA is not set 1857CONFIG_QUOTA=y
1172CONFIG_DNOTIFY=y 1858CONFIG_QUOTA_NETLINK_INTERFACE=y
1859# CONFIG_PRINT_QUOTA_WARNING is not set
1860# CONFIG_QFMT_V1 is not set
1861CONFIG_QFMT_V2=y
1862CONFIG_QUOTACTL=y
1173# CONFIG_AUTOFS_FS is not set 1863# CONFIG_AUTOFS_FS is not set
1174CONFIG_AUTOFS4_FS=y 1864CONFIG_AUTOFS4_FS=y
1175# CONFIG_FUSE_FS is not set 1865# CONFIG_FUSE_FS is not set
@@ -1180,7 +1870,7 @@ CONFIG_GENERIC_ACL=y
1180# 1870#
1181CONFIG_ISO9660_FS=y 1871CONFIG_ISO9660_FS=y
1182CONFIG_JOLIET=y 1872CONFIG_JOLIET=y
1183# CONFIG_ZISOFS is not set 1873CONFIG_ZISOFS=y
1184# CONFIG_UDF_FS is not set 1874# CONFIG_UDF_FS is not set
1185 1875
1186# 1876#
@@ -1198,13 +1888,13 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
1198# 1888#
1199CONFIG_PROC_FS=y 1889CONFIG_PROC_FS=y
1200CONFIG_PROC_KCORE=y 1890CONFIG_PROC_KCORE=y
1891CONFIG_PROC_VMCORE=y
1201CONFIG_PROC_SYSCTL=y 1892CONFIG_PROC_SYSCTL=y
1202CONFIG_SYSFS=y 1893CONFIG_SYSFS=y
1203CONFIG_TMPFS=y 1894CONFIG_TMPFS=y
1204CONFIG_TMPFS_POSIX_ACL=y 1895CONFIG_TMPFS_POSIX_ACL=y
1205CONFIG_HUGETLBFS=y 1896CONFIG_HUGETLBFS=y
1206CONFIG_HUGETLB_PAGE=y 1897CONFIG_HUGETLB_PAGE=y
1207CONFIG_RAMFS=y
1208# CONFIG_CONFIGFS_FS is not set 1898# CONFIG_CONFIGFS_FS is not set
1209 1899
1210# 1900#
@@ -1212,6 +1902,7 @@ CONFIG_RAMFS=y
1212# 1902#
1213# CONFIG_ADFS_FS is not set 1903# CONFIG_ADFS_FS is not set
1214# CONFIG_AFFS_FS is not set 1904# CONFIG_AFFS_FS is not set
1905# CONFIG_ECRYPT_FS is not set
1215# CONFIG_HFS_FS is not set 1906# CONFIG_HFS_FS is not set
1216# CONFIG_HFSPLUS_FS is not set 1907# CONFIG_HFSPLUS_FS is not set
1217# CONFIG_BEFS_FS is not set 1908# CONFIG_BEFS_FS is not set
@@ -1219,32 +1910,27 @@ CONFIG_RAMFS=y
1219# CONFIG_EFS_FS is not set 1910# CONFIG_EFS_FS is not set
1220# CONFIG_CRAMFS is not set 1911# CONFIG_CRAMFS is not set
1221# CONFIG_VXFS_FS is not set 1912# CONFIG_VXFS_FS is not set
1913# CONFIG_MINIX_FS is not set
1914# CONFIG_OMFS_FS is not set
1222# CONFIG_HPFS_FS is not set 1915# CONFIG_HPFS_FS is not set
1223# CONFIG_QNX4FS_FS is not set 1916# CONFIG_QNX4FS_FS is not set
1917# CONFIG_ROMFS_FS is not set
1224# CONFIG_SYSV_FS is not set 1918# CONFIG_SYSV_FS is not set
1225# CONFIG_UFS_FS is not set 1919# CONFIG_UFS_FS is not set
1226 1920CONFIG_NETWORK_FILESYSTEMS=y
1227#
1228# Network File Systems
1229#
1230CONFIG_NFS_FS=y 1921CONFIG_NFS_FS=y
1231CONFIG_NFS_V3=y 1922CONFIG_NFS_V3=y
1232# CONFIG_NFS_V3_ACL is not set 1923CONFIG_NFS_V3_ACL=y
1233# CONFIG_NFS_V4 is not set 1924CONFIG_NFS_V4=y
1234# CONFIG_NFS_DIRECTIO is not set
1235CONFIG_NFSD=y
1236CONFIG_NFSD_V3=y
1237# CONFIG_NFSD_V3_ACL is not set
1238# CONFIG_NFSD_V4 is not set
1239CONFIG_NFSD_TCP=y
1240CONFIG_ROOT_NFS=y 1925CONFIG_ROOT_NFS=y
1926# CONFIG_NFSD is not set
1241CONFIG_LOCKD=y 1927CONFIG_LOCKD=y
1242CONFIG_LOCKD_V4=y 1928CONFIG_LOCKD_V4=y
1243CONFIG_EXPORTFS=y 1929CONFIG_NFS_ACL_SUPPORT=y
1244CONFIG_NFS_COMMON=y 1930CONFIG_NFS_COMMON=y
1245CONFIG_SUNRPC=y 1931CONFIG_SUNRPC=y
1246# CONFIG_SUNRPC_BIND34 is not set 1932CONFIG_SUNRPC_GSS=y
1247# CONFIG_RPCSEC_GSS_KRB5 is not set 1933CONFIG_RPCSEC_GSS_KRB5=y
1248# CONFIG_RPCSEC_GSS_SPKM3 is not set 1934# CONFIG_RPCSEC_GSS_SPKM3 is not set
1249# CONFIG_SMB_FS is not set 1935# CONFIG_SMB_FS is not set
1250# CONFIG_CIFS is not set 1936# CONFIG_CIFS is not set
@@ -1255,14 +1941,26 @@ CONFIG_SUNRPC=y
1255# 1941#
1256# Partition Types 1942# Partition Types
1257# 1943#
1258# CONFIG_PARTITION_ADVANCED is not set 1944CONFIG_PARTITION_ADVANCED=y
1945# CONFIG_ACORN_PARTITION is not set
1946CONFIG_OSF_PARTITION=y
1947CONFIG_AMIGA_PARTITION=y
1948# CONFIG_ATARI_PARTITION is not set
1949CONFIG_MAC_PARTITION=y
1259CONFIG_MSDOS_PARTITION=y 1950CONFIG_MSDOS_PARTITION=y
1260 1951CONFIG_BSD_DISKLABEL=y
1261# 1952CONFIG_MINIX_SUBPARTITION=y
1262# Native Language Support 1953CONFIG_SOLARIS_X86_PARTITION=y
1263# 1954CONFIG_UNIXWARE_DISKLABEL=y
1955# CONFIG_LDM_PARTITION is not set
1956CONFIG_SGI_PARTITION=y
1957# CONFIG_ULTRIX_PARTITION is not set
1958CONFIG_SUN_PARTITION=y
1959CONFIG_KARMA_PARTITION=y
1960CONFIG_EFI_PARTITION=y
1961# CONFIG_SYSV68_PARTITION is not set
1264CONFIG_NLS=y 1962CONFIG_NLS=y
1265CONFIG_NLS_DEFAULT="iso8859-1" 1963CONFIG_NLS_DEFAULT="utf8"
1266CONFIG_NLS_CODEPAGE_437=y 1964CONFIG_NLS_CODEPAGE_437=y
1267# CONFIG_NLS_CODEPAGE_737 is not set 1965# CONFIG_NLS_CODEPAGE_737 is not set
1268# CONFIG_NLS_CODEPAGE_775 is not set 1966# CONFIG_NLS_CODEPAGE_775 is not set
@@ -1297,40 +1995,33 @@ CONFIG_NLS_ISO8859_1=y
1297# CONFIG_NLS_ISO8859_9 is not set 1995# CONFIG_NLS_ISO8859_9 is not set
1298# CONFIG_NLS_ISO8859_13 is not set 1996# CONFIG_NLS_ISO8859_13 is not set
1299# CONFIG_NLS_ISO8859_14 is not set 1997# CONFIG_NLS_ISO8859_14 is not set
1300CONFIG_NLS_ISO8859_15=y 1998# CONFIG_NLS_ISO8859_15 is not set
1301# CONFIG_NLS_KOI8_R is not set 1999# CONFIG_NLS_KOI8_R is not set
1302# CONFIG_NLS_KOI8_U is not set 2000# CONFIG_NLS_KOI8_U is not set
1303CONFIG_NLS_UTF8=y 2001CONFIG_NLS_UTF8=y
1304
1305#
1306# Distributed Lock Manager
1307#
1308# CONFIG_DLM is not set 2002# CONFIG_DLM is not set
1309 2003
1310# 2004#
1311# Instrumentation Support
1312#
1313CONFIG_PROFILING=y
1314CONFIG_OPROFILE=y
1315CONFIG_KPROBES=y
1316
1317#
1318# Kernel hacking 2005# Kernel hacking
1319# 2006#
1320CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2007CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1321# CONFIG_PRINTK_TIME is not set 2008CONFIG_PRINTK_TIME=y
1322# CONFIG_ENABLE_MUST_CHECK is not set 2009CONFIG_ENABLE_WARN_DEPRECATED=y
2010CONFIG_ENABLE_MUST_CHECK=y
2011CONFIG_FRAME_WARN=2048
1323CONFIG_MAGIC_SYSRQ=y 2012CONFIG_MAGIC_SYSRQ=y
1324CONFIG_UNUSED_SYMBOLS=y 2013# CONFIG_UNUSED_SYMBOLS is not set
1325CONFIG_DEBUG_FS=y 2014CONFIG_DEBUG_FS=y
1326# CONFIG_HEADERS_CHECK is not set 2015# CONFIG_HEADERS_CHECK is not set
1327CONFIG_DEBUG_KERNEL=y 2016CONFIG_DEBUG_KERNEL=y
1328# CONFIG_DEBUG_SHIRQ is not set 2017# CONFIG_DEBUG_SHIRQ is not set
1329CONFIG_DETECT_SOFTLOCKUP=y 2018# CONFIG_DETECT_SOFTLOCKUP is not set
1330# CONFIG_SCHED_DEBUG is not set 2019# CONFIG_SCHED_DEBUG is not set
1331# CONFIG_SCHEDSTATS is not set 2020CONFIG_SCHEDSTATS=y
1332CONFIG_TIMER_STATS=y 2021CONFIG_TIMER_STATS=y
1333# CONFIG_DEBUG_SLAB is not set 2022# CONFIG_DEBUG_OBJECTS is not set
2023# CONFIG_SLUB_DEBUG_ON is not set
2024# CONFIG_SLUB_STATS is not set
1334# CONFIG_DEBUG_RT_MUTEXES is not set 2025# CONFIG_DEBUG_RT_MUTEXES is not set
1335# CONFIG_RT_MUTEX_TESTER is not set 2026# CONFIG_RT_MUTEX_TESTER is not set
1336# CONFIG_DEBUG_SPINLOCK is not set 2027# CONFIG_DEBUG_SPINLOCK is not set
@@ -1344,30 +2035,179 @@ CONFIG_TIMER_STATS=y
1344CONFIG_DEBUG_BUGVERBOSE=y 2035CONFIG_DEBUG_BUGVERBOSE=y
1345# CONFIG_DEBUG_INFO is not set 2036# CONFIG_DEBUG_INFO is not set
1346# CONFIG_DEBUG_VM is not set 2037# CONFIG_DEBUG_VM is not set
2038# CONFIG_DEBUG_WRITECOUNT is not set
2039CONFIG_DEBUG_MEMORY_INIT=y
1347# CONFIG_DEBUG_LIST is not set 2040# CONFIG_DEBUG_LIST is not set
1348# CONFIG_FRAME_POINTER is not set 2041# CONFIG_DEBUG_SG is not set
1349CONFIG_OPTIMIZE_INLINING=y 2042CONFIG_FRAME_POINTER=y
2043# CONFIG_BOOT_PRINTK_DELAY is not set
1350# CONFIG_RCU_TORTURE_TEST is not set 2044# CONFIG_RCU_TORTURE_TEST is not set
2045# CONFIG_KPROBES_SANITY_TEST is not set
2046# CONFIG_BACKTRACE_SELF_TEST is not set
1351# CONFIG_LKDTM is not set 2047# CONFIG_LKDTM is not set
1352# CONFIG_FAULT_INJECTION is not set 2048# CONFIG_FAULT_INJECTION is not set
1353# CONFIG_DEBUG_RODATA is not set 2049# CONFIG_LATENCYTOP is not set
1354# CONFIG_IOMMU_DEBUG is not set 2050CONFIG_SYSCTL_SYSCALL_CHECK=y
2051CONFIG_HAVE_FTRACE=y
2052CONFIG_HAVE_DYNAMIC_FTRACE=y
2053# CONFIG_FTRACE is not set
2054# CONFIG_IRQSOFF_TRACER is not set
2055# CONFIG_SYSPROF_TRACER is not set
2056# CONFIG_SCHED_TRACER is not set
2057# CONFIG_CONTEXT_SWITCH_TRACER is not set
2058CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2059# CONFIG_SAMPLES is not set
2060CONFIG_HAVE_ARCH_KGDB=y
2061# CONFIG_KGDB is not set
2062# CONFIG_STRICT_DEVMEM is not set
2063CONFIG_X86_VERBOSE_BOOTUP=y
2064CONFIG_EARLY_PRINTK=y
1355CONFIG_DEBUG_STACKOVERFLOW=y 2065CONFIG_DEBUG_STACKOVERFLOW=y
1356# CONFIG_DEBUG_STACK_USAGE is not set 2066CONFIG_DEBUG_STACK_USAGE=y
2067# CONFIG_DEBUG_PAGEALLOC is not set
2068# CONFIG_DEBUG_PER_CPU_MAPS is not set
2069# CONFIG_X86_PTDUMP is not set
2070CONFIG_DEBUG_RODATA=y
2071# CONFIG_DIRECT_GBPAGES is not set
2072# CONFIG_DEBUG_RODATA_TEST is not set
2073CONFIG_DEBUG_NX_TEST=m
2074# CONFIG_IOMMU_DEBUG is not set
2075# CONFIG_MMIOTRACE is not set
2076CONFIG_IO_DELAY_TYPE_0X80=0
2077CONFIG_IO_DELAY_TYPE_0XED=1
2078CONFIG_IO_DELAY_TYPE_UDELAY=2
2079CONFIG_IO_DELAY_TYPE_NONE=3
2080CONFIG_IO_DELAY_0X80=y
2081# CONFIG_IO_DELAY_0XED is not set
2082# CONFIG_IO_DELAY_UDELAY is not set
2083# CONFIG_IO_DELAY_NONE is not set
2084CONFIG_DEFAULT_IO_DELAY_TYPE=0
2085CONFIG_DEBUG_BOOT_PARAMS=y
2086# CONFIG_CPA_DEBUG is not set
2087CONFIG_OPTIMIZE_INLINING=y
1357 2088
1358# 2089#
1359# Security options 2090# Security options
1360# 2091#
1361# CONFIG_KEYS is not set 2092CONFIG_KEYS=y
1362# CONFIG_SECURITY is not set 2093CONFIG_KEYS_DEBUG_PROC_KEYS=y
1363# CONFIG_CRYPTO is not set 2094CONFIG_SECURITY=y
2095CONFIG_SECURITY_NETWORK=y
2096# CONFIG_SECURITY_NETWORK_XFRM is not set
2097CONFIG_SECURITY_FILE_CAPABILITIES=y
2098# CONFIG_SECURITY_ROOTPLUG is not set
2099CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
2100CONFIG_SECURITY_SELINUX=y
2101CONFIG_SECURITY_SELINUX_BOOTPARAM=y
2102CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
2103CONFIG_SECURITY_SELINUX_DISABLE=y
2104CONFIG_SECURITY_SELINUX_DEVELOP=y
2105CONFIG_SECURITY_SELINUX_AVC_STATS=y
2106CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2107# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2108# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2109# CONFIG_SECURITY_SMACK is not set
2110CONFIG_CRYPTO=y
2111
2112#
2113# Crypto core or helper
2114#
2115CONFIG_CRYPTO_ALGAPI=y
2116CONFIG_CRYPTO_AEAD=y
2117CONFIG_CRYPTO_BLKCIPHER=y
2118CONFIG_CRYPTO_HASH=y
2119CONFIG_CRYPTO_MANAGER=y
2120# CONFIG_CRYPTO_GF128MUL is not set
2121# CONFIG_CRYPTO_NULL is not set
2122# CONFIG_CRYPTO_CRYPTD is not set
2123CONFIG_CRYPTO_AUTHENC=y
2124# CONFIG_CRYPTO_TEST is not set
2125
2126#
2127# Authenticated Encryption with Associated Data
2128#
2129# CONFIG_CRYPTO_CCM is not set
2130# CONFIG_CRYPTO_GCM is not set
2131# CONFIG_CRYPTO_SEQIV is not set
2132
2133#
2134# Block modes
2135#
2136CONFIG_CRYPTO_CBC=y
2137# CONFIG_CRYPTO_CTR is not set
2138# CONFIG_CRYPTO_CTS is not set
2139CONFIG_CRYPTO_ECB=y
2140# CONFIG_CRYPTO_LRW is not set
2141# CONFIG_CRYPTO_PCBC is not set
2142# CONFIG_CRYPTO_XTS is not set
2143
2144#
2145# Hash modes
2146#
2147CONFIG_CRYPTO_HMAC=y
2148# CONFIG_CRYPTO_XCBC is not set
2149
2150#
2151# Digest
2152#
2153# CONFIG_CRYPTO_CRC32C is not set
2154# CONFIG_CRYPTO_MD4 is not set
2155CONFIG_CRYPTO_MD5=y
2156# CONFIG_CRYPTO_MICHAEL_MIC is not set
2157# CONFIG_CRYPTO_RMD128 is not set
2158# CONFIG_CRYPTO_RMD160 is not set
2159# CONFIG_CRYPTO_RMD256 is not set
2160# CONFIG_CRYPTO_RMD320 is not set
2161CONFIG_CRYPTO_SHA1=y
2162# CONFIG_CRYPTO_SHA256 is not set
2163# CONFIG_CRYPTO_SHA512 is not set
2164# CONFIG_CRYPTO_TGR192 is not set
2165# CONFIG_CRYPTO_WP512 is not set
2166
2167#
2168# Ciphers
2169#
2170CONFIG_CRYPTO_AES=y
2171# CONFIG_CRYPTO_AES_X86_64 is not set
2172# CONFIG_CRYPTO_ANUBIS is not set
2173CONFIG_CRYPTO_ARC4=y
2174# CONFIG_CRYPTO_BLOWFISH is not set
2175# CONFIG_CRYPTO_CAMELLIA is not set
2176# CONFIG_CRYPTO_CAST5 is not set
2177# CONFIG_CRYPTO_CAST6 is not set
2178CONFIG_CRYPTO_DES=y
2179# CONFIG_CRYPTO_FCRYPT is not set
2180# CONFIG_CRYPTO_KHAZAD is not set
2181# CONFIG_CRYPTO_SALSA20 is not set
2182# CONFIG_CRYPTO_SALSA20_X86_64 is not set
2183# CONFIG_CRYPTO_SEED is not set
2184# CONFIG_CRYPTO_SERPENT is not set
2185# CONFIG_CRYPTO_TEA is not set
2186# CONFIG_CRYPTO_TWOFISH is not set
2187# CONFIG_CRYPTO_TWOFISH_X86_64 is not set
2188
2189#
2190# Compression
2191#
2192# CONFIG_CRYPTO_DEFLATE is not set
2193# CONFIG_CRYPTO_LZO is not set
2194CONFIG_CRYPTO_HW=y
2195# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2196CONFIG_HAVE_KVM=y
2197CONFIG_VIRTUALIZATION=y
2198# CONFIG_KVM is not set
2199# CONFIG_VIRTIO_PCI is not set
2200# CONFIG_VIRTIO_BALLOON is not set
1364 2201
1365# 2202#
1366# Library routines 2203# Library routines
1367# 2204#
1368CONFIG_BITREVERSE=y 2205CONFIG_BITREVERSE=y
2206CONFIG_GENERIC_FIND_FIRST_BIT=y
2207CONFIG_GENERIC_FIND_NEXT_BIT=y
1369# CONFIG_CRC_CCITT is not set 2208# CONFIG_CRC_CCITT is not set
1370# CONFIG_CRC16 is not set 2209# CONFIG_CRC16 is not set
2210CONFIG_CRC_T10DIF=y
1371# CONFIG_CRC_ITU_T is not set 2211# CONFIG_CRC_ITU_T is not set
1372CONFIG_CRC32=y 2212CONFIG_CRC32=y
1373# CONFIG_CRC7 is not set 2213# CONFIG_CRC7 is not set
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de5403..903de4aa5094 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12 12
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14
13aes-i586-y := aes-i586-asm_32.o aes_glue.o 15aes-i586-y := aes-i586-asm_32.o aes_glue.o
14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 16twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 17salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 000000000000..070afc5b6c94
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
1/*
2 * Using the hardware-provided CRC32 instruction to accelerate CRC32C computation.
3 * CRC32C polynomial: 0x1EDC6F41 (BE) / 0x82F63B78 (LE)
4 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/kernel.h>
22#include <crypto/internal/hash.h>
23
24#include <asm/cpufeature.h>
25
26#define CHKSUM_BLOCK_SIZE 1
27#define CHKSUM_DIGEST_SIZE 4
28
29#define SCALE_F sizeof(unsigned long)
30
31#ifdef CONFIG_X86_64
32#define REX_PRE "0x48, "
33#else
34#define REX_PRE
35#endif
36
37static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
38{
39 while (length--) {
40 __asm__ __volatile__(
41 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
42 :"=S"(crc)
43 :"0"(crc), "c"(*data)
44 );
45 data++;
46 }
47
48 return crc;
49}
50
51static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
52{
53 unsigned int iquotient = len / SCALE_F;
54 unsigned int iremainder = len % SCALE_F;
55 unsigned long *ptmp = (unsigned long *)p;
56
57 while (iquotient--) {
58 __asm__ __volatile__(
59 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
60 :"=S"(crc)
61 :"0"(crc), "c"(*ptmp)
62 );
63 ptmp++;
64 }
65
66 if (iremainder)
67 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
68 iremainder);
69
70 return crc;
71}
72
73/*
74 * Setting the seed allows arbitrary accumulators and flexible XOR policy
75 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed.
77 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
79 unsigned int keylen)
80{
81 u32 *mctx = crypto_ahash_ctx(hash);
82
83 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL;
86 }
87 *mctx = le32_to_cpup((__le32 *)key);
88 return 0;
89}
90
91static int crc32c_intel_init(struct ahash_request *req)
92{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
94 u32 *crcp = ahash_request_ctx(req);
95
96 *crcp = *mctx;
97
98 return 0;
99}
100
101static int crc32c_intel_update(struct ahash_request *req)
102{
103 struct crypto_hash_walk walk;
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111
112 *crcp = crc;
113 return 0;
114}
115
116static int crc32c_intel_final(struct ahash_request *req)
117{
118 u32 *crcp = ahash_request_ctx(req);
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0;
122}
123
124static int crc32c_intel_digest(struct ahash_request *req)
125{
126 struct crypto_hash_walk walk;
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
128 u32 crc = *mctx;
129 int nbytes;
130
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
132 nbytes = crypto_hash_walk_done(&walk, 0))
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
134
135 *(__le32 *)req->result = ~cpu_to_le32(crc);
136 return 0;
137}
138
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{
141 u32 *key = crypto_tfm_ctx(tfm);
142
143 *key = ~0;
144
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0;
148}
149
150static struct crypto_alg alg = {
151 .cra_name = "crc32c",
152 .cra_driver_name = "crc32c-intel",
153 .cra_priority = 200,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE,
156 .cra_alignmask = 3,
157 .cra_ctxsize = sizeof(u32),
158 .cra_module = THIS_MODULE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list),
160 .cra_init = crc32c_intel_cra_init,
161 .cra_type = &crypto_ahash_type,
162 .cra_u = {
163 .ahash = {
164 .digestsize = CHKSUM_DIGEST_SIZE,
165 .setkey = crc32c_intel_setkey,
166 .init = crc32c_intel_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 }
172};
173
174
175static int __init crc32c_intel_mod_init(void)
176{
177 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg);
179 else
180 return -ENODEV;
181}
182
183static void __exit crc32c_intel_mod_fini(void)
184{
185 crypto_unregister_alg(&alg);
186}
187
188module_init(crc32c_intel_mod_init);
189module_exit(crc32c_intel_mod_fini);
190
191MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
192MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
193MODULE_LICENSE("GPL");
194
195MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel");
197
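The new crc32c-intel.c driver emits the SSE4.2 crc32 instruction as raw .byte sequences, presumably so it assembles even with binutils versions that do not yet know the mnemonic. For experimenting with the same technique outside the kernel, a minimal userspace sketch of the word-then-byte accumulation used by crc32c_intel_le_hw() can be written against the SSE4.2 intrinsics; this is illustrative only, not part of the patch, and assumes a 64-bit build with gcc -msse4.2:

#include <nmmintrin.h>	/* SSE4.2 intrinsics: _mm_crc32_u8(), _mm_crc32_u64() */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Same scheme as crc32c_intel_le_hw(): full 8-byte words first, then bytes. */
static uint32_t crc32c_sse42(uint32_t crc, const unsigned char *p, size_t len)
{
	uint64_t c = crc;

	while (len >= sizeof(uint64_t)) {
		uint64_t w;

		memcpy(&w, p, sizeof(w));	/* avoid unaligned/aliasing issues */
		c = _mm_crc32_u64(c, w);
		p += sizeof(w);
		len -= sizeof(w);
	}
	while (len--)				/* byte-at-a-time tail */
		c = _mm_crc32_u8((uint32_t)c, *p++);
	return (uint32_t)c;
}

As with the kernel module (see crc32c_intel_cra_init() and crc32c_intel_final()), a caller of such a helper would normally seed with ~0 and invert the final value to obtain standard CRC32C.
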
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..127ec3f07214 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
85 dump->regs.ax = regs->ax; 85 dump->regs.ax = regs->ax;
86 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
87 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 savesegment(fs, fs);
89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 dump->regs.fs = fs;
90 savesegment(gs, gs);
91 dump->regs.gs = gs;
90 dump->regs.orig_ax = regs->orig_ax; 92 dump->regs.orig_ax = regs->orig_ax;
91 dump->regs.ip = regs->ip; 93 dump->regs.ip = regs->ip;
92 dump->regs.cs = regs->cs; 94 dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
430 current->mm->start_stack = 432 current->mm->start_stack =
431 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); 433 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
432 /* start thread */ 434 /* start thread */
433 asm volatile("movl %0,%%fs" :: "r" (0)); \ 435 loadsegment(fs, 0);
434 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 436 loadsegment(ds, __USER32_DS);
437 loadsegment(es, __USER32_DS);
435 load_gs_index(0); 438 load_gs_index(0);
436 (regs)->ip = ex.a_entry; 439 (regs)->ip = ex.a_entry;
437 (regs)->sp = current->mm->start_stack; 440 (regs)->sp = current->mm->start_stack;
@@ -441,12 +444,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 444 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 445 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 446 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 447 return 0;
451} 448}
452 449
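This diff, and the ia32_signal.c diff that follows, replaces open-coded "movl %%fs,%0"-style inline asm with the kernel's savesegment()/loadsegment() helpers. Roughly — this is a simplified sketch, not the exact <asm/system.h> definitions — the helpers look like this:

/* Approximate shape of the helpers the patch switches to: savesegment()
 * reads a segment register into a variable; loadsegment() writes one and,
 * via an exception-table fixup, falls back to the null selector if the
 * load faults on a bad value. */
#define savesegment(seg, value) \
	asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

#define loadsegment(seg, value)					\
	asm volatile("1:	movl %k0,%%" #seg "\n"		\
		     "2:\n"					\
		     ".section .fixup,\"ax\"\n"			\
		     "3:	movl %k1,%%" #seg "\n"		\
		     "	jmp 2b\n"				\
		     ".previous\n"				\
		     _ASM_EXTABLE(1b, 3b)			\
		     : : "r" (value), "r" (0))

The conversion appears to be a cleanup: it centralises the inline asm in one place and, for loadsegment(), adds fault handling for bad selector values instead of a bare mov.
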
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..4bc02b23674b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
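The new FIX_EFLAGS mask spells out, by name, the magic constant 0x40DD5 that a later hunk in this file removes. A quick self-contained check — the literals below are the architectural EFLAGS bit positions — confirms the two are identical:

#include <assert.h>

int main(void)
{
	unsigned int fix_eflags =
		0x40000 /* AC */ | 0x800 /* OF */ | 0x400 /* DF */ |
		0x100 /* TF */ | 0x80 /* SF */ | 0x40 /* ZF */ |
		0x10 /* AF */ | 0x4 /* PF */ | 0x1 /* CF */;

	assert(fix_eflags == 0x40DD5);	/* the old hard-coded mask */
	return 0;
}
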
@@ -174,9 +179,10 @@ struct sigframe
174 u32 pretcode; 179 u32 pretcode;
175 int sig; 180 int sig;
176 struct sigcontext_ia32 sc; 181 struct sigcontext_ia32 sc;
177 struct _fpstate_ia32 fpstate; 182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
178 unsigned int extramask[_COMPAT_NSIG_WORDS-1]; 183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
179 char retcode[8]; 184 char retcode[8];
185 /* fp state follows here */
180}; 186};
181 187
182struct rt_sigframe 188struct rt_sigframe
@@ -187,8 +193,8 @@ struct rt_sigframe
187 u32 puc; 193 u32 puc;
188 compat_siginfo_t info; 194 compat_siginfo_t info;
189 struct ucontext_ia32 uc; 195 struct ucontext_ia32 uc;
190 struct _fpstate_ia32 fpstate;
191 char retcode[8]; 196 char retcode[8];
197 /* fp state follows here */
192}; 198};
193 199
194#define COPY(x) { \ 200#define COPY(x) { \
@@ -201,7 +207,7 @@ struct rt_sigframe
201 { unsigned int cur; \ 207 { unsigned int cur; \
202 unsigned short pre; \ 208 unsigned short pre; \
203 err |= __get_user(pre, &sc->seg); \ 209 err |= __get_user(pre, &sc->seg); \
204 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 210 savesegment(seg, cur); \
205 pre |= mask; \ 211 pre |= mask; \
206 if (pre != cur) loadsegment(seg, pre); } 212 if (pre != cur) loadsegment(seg, pre); }
207 213
@@ -210,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
210 unsigned int *peax) 216 unsigned int *peax)
211{ 217{
212 unsigned int tmpflags, gs, oldgs, err = 0; 218 unsigned int tmpflags, gs, oldgs, err = 0;
213 struct _fpstate_ia32 __user *buf; 219 void __user *buf;
214 u32 tmp; 220 u32 tmp;
215 221
216 /* Always make any pending restarted system calls return -EINTR */ 222 /* Always make any pending restarted system calls return -EINTR */
@@ -230,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
230 */ 236 */
231 err |= __get_user(gs, &sc->gs); 237 err |= __get_user(gs, &sc->gs);
232 gs |= 3; 238 gs |= 3;
233 asm("movl %%gs,%0" : "=r" (oldgs)); 239 savesegment(gs, oldgs);
234 if (gs != oldgs) 240 if (gs != oldgs)
235 load_gs_index(gs); 241 load_gs_index(gs);
236 242
@@ -248,32 +254,18 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 254 regs->ss |= 3;
249 255
250 err |= __get_user(tmpflags, &sc->flags); 256 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 257 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 258 /* disable syscall checks */
253 regs->orig_ax = -1; 259 regs->orig_ax = -1;
254 260
255 err |= __get_user(tmp, &sc->fpstate); 261 err |= __get_user(tmp, &sc->fpstate);
256 buf = compat_ptr(tmp); 262 buf = compat_ptr(tmp);
257 if (buf) { 263 err |= restore_i387_xstate_ia32(buf);
258 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
259 goto badframe;
260 err |= restore_i387_ia32(buf);
261 } else {
262 struct task_struct *me = current;
263
264 if (used_math()) {
265 clear_fpu(me);
266 clear_used_math();
267 }
268 }
269 264
270 err |= __get_user(tmp, &sc->ax); 265 err |= __get_user(tmp, &sc->ax);
271 *peax = tmp; 266 *peax = tmp;
272 267
273 return err; 268 return err;
274
275badframe:
276 return 1;
277} 269}
278 270
279asmlinkage long sys32_sigreturn(struct pt_regs *regs) 271asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -345,46 +337,42 @@ badframe:
345 */ 337 */
346 338
347static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, 339static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
348 struct _fpstate_ia32 __user *fpstate, 340 void __user *fpstate,
349 struct pt_regs *regs, unsigned int mask) 341 struct pt_regs *regs, unsigned int mask)
350{ 342{
351 int tmp, err = 0; 343 int tmp, err = 0;
352 344
353 tmp = 0; 345 savesegment(gs, tmp);
354 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
355 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 346 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
356 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); 347 savesegment(fs, tmp);
357 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 348 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
358 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); 349 savesegment(ds, tmp);
359 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 350 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
360 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 351 savesegment(es, tmp);
361 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
362 353
363 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
364 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
365 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
366 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
367 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
368 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
369 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
370 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
371 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
372 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
373 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
374 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
375 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
376 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
377 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
378 369
379 tmp = save_i387_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
380 if (tmp < 0) 371 if (tmp < 0)
381 err = -EFAULT; 372 err = -EFAULT;
382 else { 373 else
383 clear_used_math();
384 stts();
385 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
386 &sc->fpstate); 375 &sc->fpstate);
387 }
388 376
389 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
390 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -397,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
397 * Determine which stack to use.. 385 * Determine which stack to use..
398 */ 386 */
399static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 387static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
400 size_t frame_size) 388 size_t frame_size,
389 void **fpstate)
401{ 390{
402 unsigned long sp; 391 unsigned long sp;
403 392
@@ -416,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
416 ka->sa.sa_restorer) 405 ka->sa.sa_restorer)
417 sp = (unsigned long) ka->sa.sa_restorer; 406 sp = (unsigned long) ka->sa.sa_restorer;
418 407
408 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp;
411 }
412
419 sp -= frame_size; 413 sp -= frame_size;
420 /* Align the stack pointer according to the i386 ABI, 414 /* Align the stack pointer according to the i386 ABI,
421 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 415 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -429,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
429 struct sigframe __user *frame; 423 struct sigframe __user *frame;
430 void __user *restorer; 424 void __user *restorer;
431 int err = 0; 425 int err = 0;
426 void __user *fpstate = NULL;
432 427
433 /* copy_to_user optimizes that into a single 8 byte store */ 428 /* copy_to_user optimizes that into a single 8 byte store */
434 static const struct { 429 static const struct {
@@ -443,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
443 0, 438 0,
444 }; 439 };
445 440
446 frame = get_sigframe(ka, regs, sizeof(*frame)); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
447 442
448 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
449 goto give_sigsegv; 444 return -EFAULT;
450 445
451 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
452 if (err) 447 return -EFAULT;
453 goto give_sigsegv;
454 448
455 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
456 set->sig[0]); 450 return -EFAULT;
457 if (err)
458 goto give_sigsegv;
459 451
460 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
461 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
462 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
463 if (err) 455 return -EFAULT;
464 goto give_sigsegv;
465 } 456 }
466 457
467 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -482,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
482 */ 473 */
483 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
484 if (err) 475 if (err)
485 goto give_sigsegv; 476 return -EFAULT;
486 477
487 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
488 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -493,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
493 regs->dx = 0; 484 regs->dx = 0;
494 regs->cx = 0; 485 regs->cx = 0;
495 486
496 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 487 loadsegment(ds, __USER32_DS);
497 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 488 loadsegment(es, __USER32_DS);
498 489
499 regs->cs = __USER32_CS; 490 regs->cs = __USER32_CS;
500 regs->ss = __USER32_DS; 491 regs->ss = __USER32_DS;
@@ -505,19 +496,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
505#endif 496#endif
506 497
507 return 0; 498 return 0;
508
509give_sigsegv:
510 force_sigsegv(sig, current);
511 return -EFAULT;
512} 499}
513 500
514int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 502 compat_sigset_t *set, struct pt_regs *regs)
516{ 503{
517 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 505 void __user *restorer;
520 int err = 0; 506 int err = 0;
507 void __user *fpstate = NULL;
521 508
522 /* __copy_to_user optimizes that into a single 8 byte store */ 509 /* __copy_to_user optimizes that into a single 8 byte store */
523 static const struct { 510 static const struct {
@@ -533,31 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
533 0, 520 0,
534 }; 521 };
535 522
536 frame = get_sigframe(ka, regs, sizeof(*frame)); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
537 524
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 526 return -EFAULT;
540 527
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 528 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
546 if (err) 532 if (err)
547 goto give_sigsegv; 533 return -EFAULT;
548 534
549 /* Create the ucontext. */ 535 /* Create the ucontext. */
550 err |= __put_user(0, &frame->uc.uc_flags); 536 if (cpu_has_xsave)
537 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
538 else
539 err |= __put_user(0, &frame->uc.uc_flags);
551 err |= __put_user(0, &frame->uc.uc_link); 540 err |= __put_user(0, &frame->uc.uc_link);
552 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 541 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
553 err |= __put_user(sas_ss_flags(regs->sp), 542 err |= __put_user(sas_ss_flags(regs->sp),
554 &frame->uc.uc_stack.ss_flags); 543 &frame->uc.uc_stack.ss_flags);
555 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 544 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
556 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 545 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
557 regs, set->sig[0]); 546 regs, set->sig[0]);
558 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
559 if (err) 548 if (err)
560 goto give_sigsegv; 549 return -EFAULT;
561 550
562 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
563 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -572,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
572 */ 561 */
573 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
574 if (err) 563 if (err)
575 goto give_sigsegv; 564 return -EFAULT;
576 565
577 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
578 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -588,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
588 regs->dx = (unsigned long) &frame->info; 577 regs->dx = (unsigned long) &frame->info;
589 regs->cx = (unsigned long) &frame->uc; 578 regs->cx = (unsigned long) &frame->uc;
590 579
591 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 580 loadsegment(ds, __USER32_DS);
592 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 581 loadsegment(es, __USER32_DS);
593 582
594 regs->cs = __USER32_CS; 583 regs->cs = __USER32_CS;
595 regs->ss = __USER32_DS; 584 regs->ss = __USER32_DS;
@@ -600,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
600#endif 589#endif
601 590
602 return 0; 591 return 0;
603
604give_sigsegv:
605 force_sigsegv(sig, current);
606 return -EFAULT;
607} 592}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b5e329da166c..eb4314768bf7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h>
20#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
21#define __AUDIT_ARCH_LE 0x40000000
22
23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call
26#endif
27
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19 29
20 .macro IA32_ARG_FIXUP noebp=0 30 .macro IA32_ARG_FIXUP noebp=0
@@ -29,24 +39,27 @@
29 .endm 39 .endm
30 40
31 /* clobbers %eax */ 41 /* clobbers %eax */
32 .macro CLEAR_RREGS 42 .macro CLEAR_RREGS _r9=rax
33 xorl %eax,%eax 43 xorl %eax,%eax
34 movq %rax,R11(%rsp) 44 movq %rax,R11(%rsp)
35 movq %rax,R10(%rsp) 45 movq %rax,R10(%rsp)
36 movq %rax,R9(%rsp) 46 movq %\_r9,R9(%rsp)
37 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
38 .endm 48 .endm
39 49
40 .macro LOAD_ARGS32 offset 50 /*
41 movl \offset(%rsp),%r11d 51 * Reload arg registers from stack in case ptrace changed them.
42 movl \offset+8(%rsp),%r10d 52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup.
54 */
55 .macro LOAD_ARGS32 offset, _r9=0
56 .if \_r9
43 movl \offset+16(%rsp),%r9d 57 movl \offset+16(%rsp),%r9d
44 movl \offset+24(%rsp),%r8d 58 .endif
45 movl \offset+40(%rsp),%ecx 59 movl \offset+40(%rsp),%ecx
46 movl \offset+48(%rsp),%edx 60 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 61 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 62 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 63 .endm
51 64
52 .macro CFI_STARTPROC32 simple 65 .macro CFI_STARTPROC32 simple
@@ -61,6 +74,19 @@
61 CFI_UNDEFINED r15 74 CFI_UNDEFINED r15
62 .endm 75 .endm
63 76
77#ifdef CONFIG_PARAVIRT
78ENTRY(native_usergs_sysret32)
79 swapgs
80 sysretl
81ENDPROC(native_usergs_sysret32)
82
83ENTRY(native_irq_enable_sysexit)
84 swapgs
85 sti
86 sysexit
87ENDPROC(native_irq_enable_sysexit)
88#endif
89
64/* 90/*
65 * 32bit SYSENTER instruction entry. 91 * 32bit SYSENTER instruction entry.
66 * 92 *
@@ -85,14 +111,14 @@ ENTRY(ia32_sysenter_target)
85 CFI_SIGNAL_FRAME 111 CFI_SIGNAL_FRAME
86 CFI_DEF_CFA rsp,0 112 CFI_DEF_CFA rsp,0
87 CFI_REGISTER rsp,rbp 113 CFI_REGISTER rsp,rbp
88 swapgs 114 SWAPGS_UNSAFE_STACK
89 movq %gs:pda_kernelstack, %rsp 115 movq %gs:pda_kernelstack, %rsp
90 addq $(PDA_STACKOFFSET),%rsp 116 addq $(PDA_STACKOFFSET),%rsp
91 /* 117 /*
92 * No need to follow this irqs on/off section: the syscall 118 * No need to follow this irqs on/off section: the syscall
93 * disabled irqs, here we enable it straight after entry: 119 * disabled irqs, here we enable it straight after entry:
94 */ 120 */
95 sti 121 ENABLE_INTERRUPTS(CLBR_NONE)
96 movl %ebp,%ebp /* zero extension */ 122 movl %ebp,%ebp /* zero extension */
97 pushq $__USER32_DS 123 pushq $__USER32_DS
98 CFI_ADJUST_CFA_OFFSET 8 124 CFI_ADJUST_CFA_OFFSET 8
@@ -103,7 +129,7 @@ ENTRY(ia32_sysenter_target)
103 pushfq 129 pushfq
104 CFI_ADJUST_CFA_OFFSET 8 130 CFI_ADJUST_CFA_OFFSET 8
105 /*CFI_REL_OFFSET rflags,0*/ 131 /*CFI_REL_OFFSET rflags,0*/
106 movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d 132 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
107 CFI_REGISTER rip,r10 133 CFI_REGISTER rip,r10
108 pushq $__USER32_CS 134 pushq $__USER32_CS
109 CFI_ADJUST_CFA_OFFSET 8 135 CFI_ADJUST_CFA_OFFSET 8
@@ -118,27 +144,29 @@ ENTRY(ia32_sysenter_target)
118 SAVE_ARGS 0,0,1 144 SAVE_ARGS 0,0,1
119 /* no need to do an access_ok check here because rbp has been 145 /* no need to do an access_ok check here because rbp has been
120 32bit zero extended */ 146 32bit zero extended */
1211: movl (%rbp),%r9d 1471: movl (%rbp),%ebp
122 .section __ex_table,"a" 148 .section __ex_table,"a"
123 .quad 1b,ia32_badarg 149 .quad 1b,ia32_badarg
124 .previous 150 .previous
125 GET_THREAD_INFO(%r10) 151 GET_THREAD_INFO(%r10)
126 orl $TS_COMPAT,threadinfo_status(%r10) 152 orl $TS_COMPAT,TI_status(%r10)
127 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 153 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
128 CFI_REMEMBER_STATE 154 CFI_REMEMBER_STATE
129 jnz sysenter_tracesys 155 jnz sysenter_tracesys
130sysenter_do_call:
131 cmpl $(IA32_NR_syscalls-1),%eax 156 cmpl $(IA32_NR_syscalls-1),%eax
132 ja ia32_badsys 157 ja ia32_badsys
133 IA32_ARG_FIXUP 1 158sysenter_do_call:
159 IA32_ARG_FIXUP
160sysenter_dispatch:
134 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
135 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
136 GET_THREAD_INFO(%r10) 163 GET_THREAD_INFO(%r10)
137 cli 164 DISABLE_INTERRUPTS(CLBR_NONE)
138 TRACE_IRQS_OFF 165 TRACE_IRQS_OFF
139 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 166 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
140 jnz int_ret_from_sys_call 167 jnz sysexit_audit
141 andl $~TS_COMPAT,threadinfo_status(%r10) 168sysexit_from_sys_call:
169 andl $~TS_COMPAT,TI_status(%r10)
142 /* clear IF, that popfq doesn't enable interrupts early */ 170 /* clear IF, that popfq doesn't enable interrupts early */
143 andl $~0x200,EFLAGS-R11(%rsp) 171 andl $~0x200,EFLAGS-R11(%rsp)
144 movl RIP-R11(%rsp),%edx /* User %eip */ 172 movl RIP-R11(%rsp),%edx /* User %eip */
@@ -151,23 +179,71 @@ sysenter_do_call:
151 CFI_ADJUST_CFA_OFFSET -8 179 CFI_ADJUST_CFA_OFFSET -8
152 CFI_REGISTER rsp,rcx 180 CFI_REGISTER rsp,rcx
153 TRACE_IRQS_ON 181 TRACE_IRQS_ON
154 swapgs 182 ENABLE_INTERRUPTS_SYSEXIT32
155 sti /* sti only takes effect after the next instruction */
156 /* sysexit */
157 .byte 0xf, 0x35
158 183
159sysenter_tracesys: 184#ifdef CONFIG_AUDITSYSCALL
185 .macro auditsys_entry_common
186 movl %esi,%r9d /* 6th arg: 4th syscall arg */
187 movl %edx,%r8d /* 5th arg: 3rd syscall arg */
188 /* (already in %ecx) 4th arg: 2nd syscall arg */
189 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
190 movl %eax,%esi /* 2nd arg: syscall number */
191 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
192 call audit_syscall_entry
193 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
194 cmpl $(IA32_NR_syscalls-1),%eax
195 ja ia32_badsys
196 movl %ebx,%edi /* reload 1st syscall arg */
197 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
198 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
199 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
200 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
201 .endm
202
203 .macro auditsys_exit exit,ebpsave=RBP
204 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 jnz int_ret_from_sys_call
206 TRACE_IRQS_ON
207 sti
208 movl %eax,%esi /* second arg, syscall return value */
209 cmpl $0,%eax /* is it < 0? */
210 setl %al /* 1 if so, 0 if not */
211 movzbl %al,%edi /* zero-extend that into %edi */
212 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
213 call audit_syscall_exit
214 GET_THREAD_INFO(%r10)
215 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 cli
219 TRACE_IRQS_OFF
220 testl %edi,TI_flags(%r10)
221 jnz int_with_check
222 jmp \exit
223 .endm
224
225sysenter_auditsys:
160 CFI_RESTORE_STATE 226 CFI_RESTORE_STATE
161 xchgl %r9d,%ebp 227 auditsys_entry_common
228 movl %ebp,%r9d /* reload 6th syscall arg */
229 jmp sysenter_dispatch
230
231sysexit_audit:
232 auditsys_exit sysexit_from_sys_call
233#endif
234
235sysenter_tracesys:
236#ifdef CONFIG_AUDITSYSCALL
237 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
238 jz sysenter_auditsys
239#endif
162 SAVE_REST 240 SAVE_REST
163 CLEAR_RREGS 241 CLEAR_RREGS
164 movq %r9,R9(%rsp)
165 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 242 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
166 movq %rsp,%rdi /* &pt_regs -> arg1 */ 243 movq %rsp,%rdi /* &pt_regs -> arg1 */
167 call syscall_trace_enter 244 call syscall_trace_enter
168 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 245 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
169 RESTORE_REST 246 RESTORE_REST
170 xchgl %ebp,%r9d
171 cmpl $(IA32_NR_syscalls-1),%eax 247 cmpl $(IA32_NR_syscalls-1),%eax
172 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 248 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
173 jmp sysenter_do_call 249 jmp sysenter_do_call
@@ -200,7 +276,7 @@ ENTRY(ia32_cstar_target)
200 CFI_DEF_CFA rsp,PDA_STACKOFFSET 276 CFI_DEF_CFA rsp,PDA_STACKOFFSET
201 CFI_REGISTER rip,rcx 277 CFI_REGISTER rip,rcx
202 /*CFI_REGISTER rflags,r11*/ 278 /*CFI_REGISTER rflags,r11*/
203 swapgs 279 SWAPGS_UNSAFE_STACK
204 movl %esp,%r8d 280 movl %esp,%r8d
205 CFI_REGISTER rsp,r8 281 CFI_REGISTER rsp,r8
206 movq %gs:pda_kernelstack,%rsp 282 movq %gs:pda_kernelstack,%rsp
@@ -208,7 +284,7 @@ ENTRY(ia32_cstar_target)
208 * No need to follow this irqs on/off section: the syscall 284 * No need to follow this irqs on/off section: the syscall
209 * disabled irqs and here we enable it straight after entry: 285 * disabled irqs and here we enable it straight after entry:
210 */ 286 */
211 sti 287 ENABLE_INTERRUPTS(CLBR_NONE)
212 SAVE_ARGS 8,1,1 288 SAVE_ARGS 8,1,1
213 movl %eax,%eax /* zero extension */ 289 movl %eax,%eax /* zero extension */
214 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 290 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -230,22 +306,24 @@ ENTRY(ia32_cstar_target)
230 .quad 1b,ia32_badarg 306 .quad 1b,ia32_badarg
231 .previous 307 .previous
232 GET_THREAD_INFO(%r10) 308 GET_THREAD_INFO(%r10)
233 orl $TS_COMPAT,threadinfo_status(%r10) 309 orl $TS_COMPAT,TI_status(%r10)
234 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
235 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
236 jnz cstar_tracesys 312 jnz cstar_tracesys
237cstar_do_call:
238 cmpl $IA32_NR_syscalls-1,%eax 313 cmpl $IA32_NR_syscalls-1,%eax
239 ja ia32_badsys 314 ja ia32_badsys
315cstar_do_call:
240 IA32_ARG_FIXUP 1 316 IA32_ARG_FIXUP 1
317cstar_dispatch:
241 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
242 movq %rax,RAX-ARGOFFSET(%rsp) 319 movq %rax,RAX-ARGOFFSET(%rsp)
243 GET_THREAD_INFO(%r10) 320 GET_THREAD_INFO(%r10)
244 cli 321 DISABLE_INTERRUPTS(CLBR_NONE)
245 TRACE_IRQS_OFF 322 TRACE_IRQS_OFF
246 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 323 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
247 jnz int_ret_from_sys_call 324 jnz sysretl_audit
248 andl $~TS_COMPAT,threadinfo_status(%r10) 325sysretl_from_sys_call:
326 andl $~TS_COMPAT,TI_status(%r10)
249 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 327 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
250 movl RIP-ARGOFFSET(%rsp),%ecx 328 movl RIP-ARGOFFSET(%rsp),%ecx
251 CFI_REGISTER rip,rcx 329 CFI_REGISTER rip,rcx
@@ -254,22 +332,34 @@ cstar_do_call:
254 TRACE_IRQS_ON 332 TRACE_IRQS_ON
255 movl RSP-ARGOFFSET(%rsp),%esp 333 movl RSP-ARGOFFSET(%rsp),%esp
256 CFI_RESTORE rsp 334 CFI_RESTORE rsp
257 swapgs 335 USERGS_SYSRET32
258 sysretl
259 336
260cstar_tracesys: 337#ifdef CONFIG_AUDITSYSCALL
338cstar_auditsys:
261 CFI_RESTORE_STATE 339 CFI_RESTORE_STATE
340 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
341 auditsys_entry_common
342 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
343 jmp cstar_dispatch
344
345sysretl_audit:
346 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
347#endif
348
349cstar_tracesys:
350#ifdef CONFIG_AUDITSYSCALL
351 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
352 jz cstar_auditsys
353#endif
262 xchgl %r9d,%ebp 354 xchgl %r9d,%ebp
263 SAVE_REST 355 SAVE_REST
264 CLEAR_RREGS 356 CLEAR_RREGS r9
265 movq %r9,R9(%rsp)
266 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
267 movq %rsp,%rdi /* &pt_regs -> arg1 */ 358 movq %rsp,%rdi /* &pt_regs -> arg1 */
268 call syscall_trace_enter 359 call syscall_trace_enter
269 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 360 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
270 RESTORE_REST 361 RESTORE_REST
271 xchgl %ebp,%r9d 362 xchgl %ebp,%r9d
272 movl RSP-ARGOFFSET(%rsp), %r8d
273 cmpl $(IA32_NR_syscalls-1),%eax 363 cmpl $(IA32_NR_syscalls-1),%eax
274 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 364 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
275 jmp cstar_do_call 365 jmp cstar_do_call
@@ -310,12 +400,13 @@ ENTRY(ia32_syscall)
310 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 400 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
311 /*CFI_REL_OFFSET cs,CS-RIP*/ 401 /*CFI_REL_OFFSET cs,CS-RIP*/
312 CFI_REL_OFFSET rip,RIP-RIP 402 CFI_REL_OFFSET rip,RIP-RIP
313 swapgs 403 PARAVIRT_ADJUST_EXCEPTION_FRAME
404 SWAPGS
314 /* 405 /*
315 * No need to follow this irqs on/off section: the syscall 406 * No need to follow this irqs on/off section: the syscall
316 * disabled irqs and here we enable it straight after entry: 407 * disabled irqs and here we enable it straight after entry:
317 */ 408 */
318 sti 409 ENABLE_INTERRUPTS(CLBR_NONE)
319 movl %eax,%eax 410 movl %eax,%eax
320 pushq %rax 411 pushq %rax
321 CFI_ADJUST_CFA_OFFSET 8 412 CFI_ADJUST_CFA_OFFSET 8
@@ -324,8 +415,8 @@ ENTRY(ia32_syscall)
324 this could be a problem. */ 415 this could be a problem. */
325 SAVE_ARGS 0,0,1 416 SAVE_ARGS 0,0,1
326 GET_THREAD_INFO(%r10) 417 GET_THREAD_INFO(%r10)
327 orl $TS_COMPAT,threadinfo_status(%r10) 418 orl $TS_COMPAT,TI_status(%r10)
328 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 419 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
329 jnz ia32_tracesys 420 jnz ia32_tracesys
330ia32_do_syscall: 421ia32_do_syscall:
331 cmpl $(IA32_NR_syscalls-1),%eax 422 cmpl $(IA32_NR_syscalls-1),%eax
@@ -370,13 +461,11 @@ quiet_ni_syscall:
370 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi 461 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
371 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi 462 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
372 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx 463 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
373 PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
374 PTREGSCALL stub32_execve, sys32_execve, %rcx 464 PTREGSCALL stub32_execve, sys32_execve, %rcx
375 PTREGSCALL stub32_fork, sys_fork, %rdi 465 PTREGSCALL stub32_fork, sys_fork, %rdi
376 PTREGSCALL stub32_clone, sys32_clone, %rdx 466 PTREGSCALL stub32_clone, sys32_clone, %rdx
377 PTREGSCALL stub32_vfork, sys_vfork, %rdi 467 PTREGSCALL stub32_vfork, sys_vfork, %rdi
378 PTREGSCALL stub32_iopl, sys_iopl, %rsi 468 PTREGSCALL stub32_iopl, sys_iopl, %rsi
379 PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
380 469
381ENTRY(ia32_ptregs_common) 470ENTRY(ia32_ptregs_common)
382 popq %r11 471 popq %r11
@@ -476,7 +565,7 @@ ia32_sys_call_table:
476 .quad sys_ssetmask 565 .quad sys_ssetmask
477 .quad sys_setreuid16 /* 70 */ 566 .quad sys_setreuid16 /* 70 */
478 .quad sys_setregid16 567 .quad sys_setregid16
479 .quad stub32_sigsuspend 568 .quad sys32_sigsuspend
480 .quad compat_sys_sigpending 569 .quad compat_sys_sigpending
481 .quad sys_sethostname 570 .quad sys_sethostname
482 .quad compat_sys_setrlimit /* 75 */ 571 .quad compat_sys_setrlimit /* 75 */
@@ -583,7 +672,7 @@ ia32_sys_call_table:
583 .quad sys32_rt_sigpending 672 .quad sys32_rt_sigpending
584 .quad compat_sys_rt_sigtimedwait 673 .quad compat_sys_rt_sigtimedwait
585 .quad sys32_rt_sigqueueinfo 674 .quad sys32_rt_sigqueueinfo
586 .quad stub32_rt_sigsuspend 675 .quad sys_rt_sigsuspend
587 .quad sys32_pread /* 180 */ 676 .quad sys32_pread /* 180 */
588 .quad sys32_pwrite 677 .quad sys32_pwrite
589 .quad sys_chown16 678 .quad sys_chown16
@@ -731,4 +820,10 @@ ia32_sys_call_table:
731 .quad sys32_fallocate 820 .quad sys32_fallocate
732 .quad compat_sys_timerfd_settime /* 325 */ 821 .quad compat_sys_timerfd_settime /* 325 */
733 .quad compat_sys_timerfd_gettime 822 .quad compat_sys_timerfd_gettime
823 .quad compat_sys_signalfd4
824 .quad sys_eventfd2
825 .quad sys_epoll_create1
826 .quad sys_dup3 /* 330 */
827 .quad sys_pipe2
828 .quad sys_inotify_init1
734ia32_syscall_end: 829ia32_syscall_end:
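
The ia32_sys_call_table at the end of this file is a flat array of 64-bit entries indexed by the 32-bit syscall number: the entry paths zero-extend %eax, compare it against IA32_NR_syscalls-1 (jumping to ia32_badsys when it is out of range) and then dispatch with "call *ia32_sys_call_table(,%rax,8)". Below is a minimal userspace C sketch of that table-dispatch pattern; the handler names, table size and return values are invented for illustration and are not kernel code.

#include <stdio.h>
#include <stddef.h>

/* Illustrative handlers; the real table holds sys_* / compat_sys_* entries. */
static long fake_getpid(long a, long b, long c) { (void)a; (void)b; (void)c; return 42; }
static long fake_write(long fd, long buf, long len) { (void)fd; (void)buf; return len; }
static long bad_syscall(long a, long b, long c) { (void)a; (void)b; (void)c; return -38; /* -ENOSYS */ }

typedef long (*syscall_fn)(long, long, long);

/* Analogue of ia32_sys_call_table: the index is the syscall number. */
static const syscall_fn call_table[] = { fake_getpid, fake_write };
#define NR_CALLS (sizeof(call_table) / sizeof(call_table[0]))

static long dispatch(unsigned int nr, long a, long b, long c)
{
        /* Mirrors "cmpl $IA32_NR_syscalls-1,%eax; ja ia32_badsys". */
        if (nr > NR_CALLS - 1)
                return bad_syscall(a, b, c);
        /* Mirrors "call *ia32_sys_call_table(,%rax,8)". */
        return call_table[nr](a, b, c);
}

int main(void)
{
        printf("%ld %ld %ld\n", dispatch(0, 0, 0, 0), dispatch(1, 1, 0, 5), dispatch(99, 0, 0, 0));
        return 0;
}
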
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..beda4232ce69 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd)
238 int retval; 238 int retval;
239 int fds[2]; 239 int fds[2];
240 240
241 retval = do_pipe(fds); 241 retval = do_pipe_flags(fds, 0);
242 if (retval) 242 if (retval)
243 goto out; 243 goto out;
244 if (copy_to_user(fd, fds, sizeof(fds))) 244 if (copy_to_user(fd, fds, sizeof(fds)))
@@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
556 return ret; 556 return ret;
557} 557}
558 558
559/* These are here just in case some old ia32 binary calls it. */
560asmlinkage long sys32_pause(void)
561{
562 current->state = TASK_INTERRUPTIBLE;
563 schedule();
564 return -ERESTARTNOHAND;
565}
566
567
568#ifdef CONFIG_SYSCTL_SYSCALL 559#ifdef CONFIG_SYSCTL_SYSCALL
569struct sysctl_ia32 { 560struct sysctl_ia32 {
570 unsigned int name; 561 unsigned int name;
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed4..08f4fd731469 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
1vsyscall.lds 1vsyscall.lds
2vsyscall_32.lds 2vsyscall_32.lds
3vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d001739d8b06..50632e16d01c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,10 +2,17 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds 5extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14endif
15
9# 16#
10# vsyscalls (which work on the user stack) should have 17# vsyscalls (which work on the user stack) should have
11# no stack-protector checks: 18# no stack-protector checks:
@@ -13,25 +20,26 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
13nostackp := $(call cc-option, -fno-stack-protector) 20nostackp := $(call cc-option, -fno-stack-protector)
14CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 21CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
15CFLAGS_hpet.o := $(nostackp) 22CFLAGS_hpet.o := $(nostackp)
16CFLAGS_tsc_64.o := $(nostackp) 23CFLAGS_tsc.o := $(nostackp)
17CFLAGS_paravirt.o := $(nostackp) 24CFLAGS_paravirt.o := $(nostackp)
18 25
19obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
20obj-y += traps_$(BITS).o irq_$(BITS).o 27obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o
21obj-y += time_$(BITS).o ioport.o ldt.o 28obj-y += time_$(BITS).o ioport.o ldt.o
22obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o 29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
30obj-$(CONFIG_X86_VISWS) += visws_quirks.o
31obj-$(CONFIG_X86_32) += probe_roms_32.o
23obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 32obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
24obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 33obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
25obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o 34obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
26obj-y += bootflag.o e820_$(BITS).o 35obj-y += bootflag.o e820.o
27obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 36obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
28obj-y += alternative.o i8253.o pci-nommu.o 37obj-y += alternative.o i8253.o pci-nommu.o
29obj-$(CONFIG_X86_64) += bugs_64.o 38obj-y += tsc.o io_delay.o rtc.o
30obj-y += tsc_$(BITS).o io_delay.o rtc.o
31 39
32obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 40obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
33obj-y += process.o 41obj-y += process.o
34obj-y += i387.o 42obj-y += i387.o xsave.o
35obj-y += ptrace.o 43obj-y += ptrace.o
36obj-y += ds.o 44obj-y += ds.o
37obj-$(CONFIG_X86_32) += tls.o 45obj-$(CONFIG_X86_32) += tls.o
@@ -44,7 +52,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
44obj-$(CONFIG_MCA) += mca_32.o 52obj-$(CONFIG_MCA) += mca_32.o
45obj-$(CONFIG_X86_MSR) += msr.o 53obj-$(CONFIG_X86_MSR) += msr.o
46obj-$(CONFIG_X86_CPUID) += cpuid.o 54obj-$(CONFIG_X86_CPUID) += cpuid.o
47obj-$(CONFIG_MICROCODE) += microcode.o
48obj-$(CONFIG_PCI) += early-quirks.o 55obj-$(CONFIG_PCI) += early-quirks.o
49apm-y := apm_32.o 56apm-y := apm_32.o
50obj-$(CONFIG_APM) += apm.o 57obj-$(CONFIG_APM) += apm.o
@@ -54,18 +61,19 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
54obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 61obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
55obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 62obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
56obj-$(CONFIG_X86_MPPARSE) += mpparse.o 63obj-$(CONFIG_X86_MPPARSE) += mpparse.o
57obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o 64obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
58obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 65obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
59obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
60obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
61obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
62obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
63obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 71obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
72obj-$(CONFIG_X86_ES7000) += es7000_32.o
64obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o 73obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
65obj-y += vsmp_64.o 74obj-y += vsmp_64.o
66obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
67obj-$(CONFIG_MODULES) += module_$(BITS).o 76obj-$(CONFIG_MODULES) += module_$(BITS).o
68obj-$(CONFIG_ACPI_SRAT) += srat_32.o
69obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
70obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
71obj-$(CONFIG_KGDB) += kgdb.o 79obj-$(CONFIG_KGDB) += kgdb.o
@@ -82,7 +90,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
82obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
83obj-$(CONFIG_KVM_GUEST) += kvm.o 91obj-$(CONFIG_KVM_GUEST) += kvm.o
84obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
86obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
87 95
88obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -92,15 +100,24 @@ scx200-y += scx200_32.o
92 100
93obj-$(CONFIG_OLPC) += olpc.o 101obj-$(CONFIG_OLPC) += olpc.o
94 102
103microcode-y := microcode_core.o
104microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
105microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
106obj-$(CONFIG_MICROCODE) += microcode.o
107
95### 108###
96# 64 bit specific files 109# 64 bit specific files
97ifeq ($(CONFIG_X86_64),y) 110ifeq ($(CONFIG_X86_64),y)
98 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o 111 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
112 obj-y += bios_uv.o
113 obj-y += genx2apic_cluster.o
114 obj-y += genx2apic_phys.o
99 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
100 obj-$(CONFIG_AUDIT) += audit_64.o 116 obj-$(CONFIG_AUDIT) += audit_64.o
101 117
102 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 118 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
103 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 119 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
120 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
104 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o 121 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
105 122
106 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 123 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..eb875cdc7367 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/io_apic.h> 38#include <asm/io_apic.h>
39#include <asm/apic.h> 39#include <asm/apic.h>
40#include <asm/genapic.h>
40#include <asm/io.h> 41#include <asm/io.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/smp.h> 43#include <asm/smp.h>
@@ -57,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
57#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
58 59
59#include <asm/proto.h> 60#include <asm/proto.h>
60#include <asm/genapic.h>
61 61
62#else /* X86 */ 62#else /* X86 */
63 63
@@ -106,21 +106,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
106 */ 106 */
107enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; 107enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
108 108
109#ifdef CONFIG_X86_64
110
111/* rely on all ACPI tables being in the direct mapping */
112char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
113{
114 if (!phys_addr || !size)
115 return NULL;
116
117 if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
118 return __va(phys_addr);
119
120 return NULL;
121}
122
123#else
124 109
125/* 110/*
126 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 111 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -139,11 +124,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
139 unsigned long base, offset, mapped_size; 124 unsigned long base, offset, mapped_size;
140 int idx; 125 int idx;
141 126
142 if (phys + size < 8 * 1024 * 1024) 127 if (!phys || !size)
128 return NULL;
129
130 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
143 return __va(phys); 131 return __va(phys);
144 132
145 offset = phys & (PAGE_SIZE - 1); 133 offset = phys & (PAGE_SIZE - 1);
146 mapped_size = PAGE_SIZE - offset; 134 mapped_size = PAGE_SIZE - offset;
135 clear_fixmap(FIX_ACPI_END);
147 set_fixmap(FIX_ACPI_END, phys); 136 set_fixmap(FIX_ACPI_END, phys);
148 base = fix_to_virt(FIX_ACPI_END); 137 base = fix_to_virt(FIX_ACPI_END);
149 138
@@ -155,19 +144,29 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
155 if (--idx < FIX_ACPI_BEGIN) 144 if (--idx < FIX_ACPI_BEGIN)
156 return NULL; /* cannot handle this */ 145 return NULL; /* cannot handle this */
157 phys += PAGE_SIZE; 146 phys += PAGE_SIZE;
147 clear_fixmap(idx);
158 set_fixmap(idx, phys); 148 set_fixmap(idx, phys);
159 mapped_size += PAGE_SIZE; 149 mapped_size += PAGE_SIZE;
160 } 150 }
161 151
162 return ((unsigned char *)base + offset); 152 return ((unsigned char *)base + offset);
163} 153}
164#endif
165 154
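
__acpi_map_table() above now refuses NULL/zero arguments, returns a direct-mapped __va() pointer when the table already lies below max_low_pfn_mapped, and otherwise maps the table one fixmap page at a time: the first slot covers only PAGE_SIZE - offset bytes, and further slots are taken (walking down from FIX_ACPI_END towards FIX_ACPI_BEGIN) until mapped_size reaches size. A small standalone sketch of that page-count arithmetic, assuming 4 KiB pages; fixmap_pages_needed() is an invented name, not a kernel helper.

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* How many fixmap slots the loop in __acpi_map_table ends up using. */
static unsigned long fixmap_pages_needed(unsigned long phys, unsigned long size)
{
        unsigned long offset = phys & (PAGE_SIZE - 1);

        /* first partial page plus the remaining pages, rounded up */
        return (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
        /* A small table that straddles a page boundary needs two slots. */
        printf("%lu\n", fixmap_pages_needed(0x000FE000 + 4090, 36));  /* -> 2 */
        /* A page-aligned table smaller than a page needs one. */
        printf("%lu\n", fixmap_pages_needed(0x7FEF0000, 120));        /* -> 1 */
        return 0;
}
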
166#ifdef CONFIG_PCI_MMCONFIG 155#ifdef CONFIG_PCI_MMCONFIG
167/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 156/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
168struct acpi_mcfg_allocation *pci_mmcfg_config; 157struct acpi_mcfg_allocation *pci_mmcfg_config;
169int pci_mmcfg_config_num; 158int pci_mmcfg_config_num;
170 159
160static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
161
162static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
163{
164 if (!strcmp(mcfg->header.oem_id, "SGI"))
165 acpi_mcfg_64bit_base_addr = TRUE;
166
167 return 0;
168}
169
171int __init acpi_parse_mcfg(struct acpi_table_header *header) 170int __init acpi_parse_mcfg(struct acpi_table_header *header)
172{ 171{
173 struct acpi_table_mcfg *mcfg; 172 struct acpi_table_mcfg *mcfg;
@@ -200,8 +199,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
200 } 199 }
201 200
202 memcpy(pci_mmcfg_config, &mcfg[1], config_size); 201 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
202
203 acpi_mcfg_oem_check(mcfg);
204
203 for (i = 0; i < pci_mmcfg_config_num; ++i) { 205 for (i = 0; i < pci_mmcfg_config_num; ++i) {
204 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { 206 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
207 !acpi_mcfg_64bit_base_addr) {
205 printk(KERN_ERR PREFIX 208 printk(KERN_ERR PREFIX
206 "MMCONFIG not in low 4GB of memory\n"); 209 "MMCONFIG not in low 4GB of memory\n");
207 kfree(pci_mmcfg_config); 210 kfree(pci_mmcfg_config);
@@ -249,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
249 return; 252 return;
250 } 253 }
251 254
252#ifdef CONFIG_X86_32
253 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
254 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
255#endif
256 257
257 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
258} 259}
@@ -338,8 +339,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
338 339
339#ifdef CONFIG_X86_IO_APIC 340#ifdef CONFIG_X86_IO_APIC
340 341
341struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
342
343static int __init 342static int __init
344acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) 343acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
345{ 344{
@@ -514,8 +513,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
514 * Make sure all (legacy) PCI IRQs are set as level-triggered. 513 * Make sure all (legacy) PCI IRQs are set as level-triggered.
515 */ 514 */
516 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 515 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
517 extern void eisa_set_level_irq(unsigned int irq);
518
519 if (triggering == ACPI_LEVEL_SENSITIVE) 516 if (triggering == ACPI_LEVEL_SENSITIVE)
520 eisa_set_level_irq(gsi); 517 eisa_set_level_irq(gsi);
521 } 518 }
@@ -775,11 +772,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
775 772
776 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
777 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
778 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 775 boot_cpu_physical_apicid = read_apic_id();
779#ifdef CONFIG_X86_32
780 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
781 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
782#endif
783 } 778 }
784} 779}
785 780
@@ -860,6 +855,364 @@ static int __init acpi_parse_madt_lapic_entries(void)
860#endif /* CONFIG_X86_LOCAL_APIC */ 855#endif /* CONFIG_X86_LOCAL_APIC */
861 856
862#ifdef CONFIG_X86_IO_APIC 857#ifdef CONFIG_X86_IO_APIC
858#define MP_ISA_BUS 0
859
860#ifdef CONFIG_X86_ES7000
861extern int es7000_plat;
862#endif
863
864static struct {
865 int apic_id;
866 int gsi_base;
867 int gsi_end;
868 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
869} mp_ioapic_routing[MAX_IO_APICS];
870
871static int mp_find_ioapic(int gsi)
872{
873 int i = 0;
874
875 /* Find the IOAPIC that manages this GSI. */
876 for (i = 0; i < nr_ioapics; i++) {
877 if ((gsi >= mp_ioapic_routing[i].gsi_base)
878 && (gsi <= mp_ioapic_routing[i].gsi_end))
879 return i;
880 }
881
882 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
883 return -1;
884}
885
886static u8 __init uniq_ioapic_id(u8 id)
887{
888#ifdef CONFIG_X86_32
889 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
890 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
891 return io_apic_get_unique_id(nr_ioapics, id);
892 else
893 return id;
894#else
895 int i;
896 DECLARE_BITMAP(used, 256);
897 bitmap_zero(used, 256);
898 for (i = 0; i < nr_ioapics; i++) {
899 struct mp_config_ioapic *ia = &mp_ioapics[i];
900 __set_bit(ia->mp_apicid, used);
901 }
902 if (!test_bit(id, used))
903 return id;
904 return find_first_zero_bit(used, 256);
905#endif
906}
907
908static int bad_ioapic(unsigned long address)
909{
910 if (nr_ioapics >= MAX_IO_APICS) {
911 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
912 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
913 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
914 }
915 if (!address) {
916 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
917 " found in table, skipping!\n");
918 return 1;
919 }
920 return 0;
921}
922
923void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
924{
925 int idx = 0;
926
927 if (bad_ioapic(address))
928 return;
929
930 idx = nr_ioapics;
931
932 mp_ioapics[idx].mp_type = MP_IOAPIC;
933 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
934 mp_ioapics[idx].mp_apicaddr = address;
935
936 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
937 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
938#ifdef CONFIG_X86_32
939 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
940#else
941 mp_ioapics[idx].mp_apicver = 0;
942#endif
943 /*
944 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
945 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
946 */
947 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
948 mp_ioapic_routing[idx].gsi_base = gsi_base;
949 mp_ioapic_routing[idx].gsi_end = gsi_base +
950 io_apic_get_redir_entries(idx);
951
952 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
953 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
954 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
955 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
956
957 nr_ioapics++;
958}
959
960static void assign_to_mp_irq(struct mp_config_intsrc *m,
961 struct mp_config_intsrc *mp_irq)
962{
963 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
964}
965
966static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
967 struct mp_config_intsrc *m)
968{
969 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
970}
971
972static void save_mp_irq(struct mp_config_intsrc *m)
973{
974 int i;
975
976 for (i = 0; i < mp_irq_entries; i++) {
977 if (!mp_irq_cmp(&mp_irqs[i], m))
978 return;
979 }
980
981 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
982 if (++mp_irq_entries == MAX_IRQ_SOURCES)
983 panic("Max # of irq sources exceeded!!\n");
984}
985
986void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
987{
988 int ioapic;
989 int pin;
990 struct mp_config_intsrc mp_irq;
991
992 /*
993 * Convert 'gsi' to 'ioapic.pin'.
994 */
995 ioapic = mp_find_ioapic(gsi);
996 if (ioapic < 0)
997 return;
998 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
999
1000 /*
1001 * TBD: This check is for faulty timer entries, where the override
1002 * erroneously sets the trigger to level, resulting in a HUGE
1003 * increase of timer interrupts!
1004 */
1005 if ((bus_irq == 0) && (trigger == 3))
1006 trigger = 1;
1007
1008 mp_irq.mp_type = MP_INTSRC;
1009 mp_irq.mp_irqtype = mp_INT;
1010 mp_irq.mp_irqflag = (trigger << 2) | polarity;
1011 mp_irq.mp_srcbus = MP_ISA_BUS;
1012 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
1013 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
1014 mp_irq.mp_dstirq = pin; /* INTIN# */
1015
1016 save_mp_irq(&mp_irq);
1017}
1018
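
mp_override_legacy_irq() above packs the MP-table interrupt flags as (trigger << 2) | polarity: two bits of polarity in bits 0-1 and two bits of trigger mode in bits 2-3, where 0 means "conforms to bus", 1 means active-high/edge and 3 means active-low/level. mp_config_acpi_gsi() further down builds the same field from the ACPI constants (4 or 0x0c for the trigger, 1 or 3 for the polarity). A small sketch of that encoding; the enum names are my own, not kernel symbols.

#include <stdio.h>

/* MP-table style 2-bit fields, as used for mp_irqflag above:
 * bits 0-1 polarity, bits 2-3 trigger; 0 = conforming, 1 = high/edge, 3 = low/level. */
enum { POL_CONFORMS = 0, POL_HIGH = 1, POL_LOW = 3 };
enum { TRIG_CONFORMS = 0, TRIG_EDGE = 1, TRIG_LEVEL = 3 };

static unsigned int mp_irqflag(unsigned int trigger, unsigned int polarity)
{
        return (trigger << 2) | polarity;   /* same packing as mp_override_legacy_irq() */
}

int main(void)
{
        /* Edge-triggered, active-high (typical ISA): 0x05 */
        printf("0x%02x\n", mp_irqflag(TRIG_EDGE, POL_HIGH));
        /* Level-triggered, active-low (typical PCI): 0x0f, matching the
         * 0x0c | 3 built in mp_config_acpi_gsi() below. */
        printf("0x%02x\n", mp_irqflag(TRIG_LEVEL, POL_LOW));
        return 0;
}
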
1019void __init mp_config_acpi_legacy_irqs(void)
1020{
1021 int i;
1022 int ioapic;
1023 unsigned int dstapic;
1024 struct mp_config_intsrc mp_irq;
1025
1026#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1027 /*
1028 * Fabricate the legacy ISA bus (bus #31).
1029 */
1030 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
1031#endif
1032 set_bit(MP_ISA_BUS, mp_bus_not_pci);
1033 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
1034
1035#ifdef CONFIG_X86_ES7000
1036 /*
1037 * Older generations of ES7000 have no legacy identity mappings
1038 */
1039 if (es7000_plat == 1)
1040 return;
1041#endif
1042
1043 /*
1044 * Locate the IOAPIC that manages the ISA IRQs (0-15).
1045 */
1046 ioapic = mp_find_ioapic(0);
1047 if (ioapic < 0)
1048 return;
1049 dstapic = mp_ioapics[ioapic].mp_apicid;
1050
1051 /*
 1052 * Use the default configuration for the IRQs 0-15, unless
 1053 * overridden by (MADT) interrupt source override entries.
1054 */
1055 for (i = 0; i < 16; i++) {
1056 int idx;
1057
1058 for (idx = 0; idx < mp_irq_entries; idx++) {
1059 struct mp_config_intsrc *irq = mp_irqs + idx;
1060
1061 /* Do we already have a mapping for this ISA IRQ? */
1062 if (irq->mp_srcbus == MP_ISA_BUS
1063 && irq->mp_srcbusirq == i)
1064 break;
1065
1066 /* Do we already have a mapping for this IOAPIC pin */
1067 if (irq->mp_dstapic == dstapic &&
1068 irq->mp_dstirq == i)
1069 break;
1070 }
1071
1072 if (idx != mp_irq_entries) {
1073 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1074 continue; /* IRQ already used */
1075 }
1076
1077 mp_irq.mp_type = MP_INTSRC;
1078 mp_irq.mp_irqflag = 0; /* Conforming */
1079 mp_irq.mp_srcbus = MP_ISA_BUS;
1080 mp_irq.mp_dstapic = dstapic;
1081 mp_irq.mp_irqtype = mp_INT;
1082 mp_irq.mp_srcbusirq = i; /* Identity mapped */
1083 mp_irq.mp_dstirq = i;
1084
1085 save_mp_irq(&mp_irq);
1086 }
1087}
1088
1089int mp_register_gsi(u32 gsi, int triggering, int polarity)
1090{
1091 int ioapic;
1092 int ioapic_pin;
1093#ifdef CONFIG_X86_32
1094#define MAX_GSI_NUM 4096
1095#define IRQ_COMPRESSION_START 64
1096
1097 static int pci_irq = IRQ_COMPRESSION_START;
1098 /*
1099 * Mapping between Global System Interrupts, which
1100 * represent all possible interrupts, and IRQs
1101 * assigned to actual devices.
1102 */
1103 static int gsi_to_irq[MAX_GSI_NUM];
1104#else
1105
1106 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1107 return gsi;
1108#endif
1109
1110 /* Don't set up the ACPI SCI because it's already set up */
1111 if (acpi_gbl_FADT.sci_interrupt == gsi)
1112 return gsi;
1113
1114 ioapic = mp_find_ioapic(gsi);
1115 if (ioapic < 0) {
1116 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1117 return gsi;
1118 }
1119
1120 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1121
1122#ifdef CONFIG_X86_32
1123 if (ioapic_renumber_irq)
1124 gsi = ioapic_renumber_irq(ioapic, gsi);
1125#endif
1126
1127 /*
1128 * Avoid pin reprogramming. PRTs typically include entries
1129 * with redundant pin->gsi mappings (but unique PCI devices);
1130 * we only program the IOAPIC on the first.
1131 */
1132 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1133 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1134 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1135 ioapic_pin);
1136 return gsi;
1137 }
1138 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 1139 pr_debug("Pin %d-%d already programmed\n",
1140 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1141#ifdef CONFIG_X86_32
1142 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1143#else
1144 return gsi;
1145#endif
1146 }
1147
1148 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1149#ifdef CONFIG_X86_32
1150 /*
1151 * For GSI >= 64, use IRQ compression
1152 */
1153 if ((gsi >= IRQ_COMPRESSION_START)
1154 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1155 /*
1156 * For PCI devices assign IRQs in order, avoiding gaps
1157 * due to unused I/O APIC pins.
1158 */
1159 int irq = gsi;
1160 if (gsi < MAX_GSI_NUM) {
1161 /*
1162 * Retain the VIA chipset work-around (gsi > 15), but
1163 * avoid a problem where the 8254 timer (IRQ0) is setup
1164 * via an override (so it's not on pin 0 of the ioapic),
1165 * and at the same time, the pin 0 interrupt is a PCI
1166 * type. The gsi > 15 test could cause these two pins
1167 * to be shared as IRQ0, and they are not shareable.
1168 * So test for this condition, and if necessary, avoid
1169 * the pin collision.
1170 */
1171 gsi = pci_irq++;
1172 /*
1173 * Don't assign IRQ used by ACPI SCI
1174 */
1175 if (gsi == acpi_gbl_FADT.sci_interrupt)
1176 gsi = pci_irq++;
1177 gsi_to_irq[irq] = gsi;
1178 } else {
1179 printk(KERN_ERR "GSI %u is too high\n", gsi);
1180 return gsi;
1181 }
1182 }
1183#endif
1184 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1185 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1186 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1187 return gsi;
1188}
1189
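
On 32-bit kernels, mp_register_gsi() above compresses level-triggered GSIs at or above IRQ_COMPRESSION_START into a dense IRQ space: each new GSI is assigned the next value of the running pci_irq counter (skipping the ACPI SCI), and gsi_to_irq[] remembers the assignment so repeated PRT entries for the same pin map to the same IRQ. A minimal standalone model of that allocator; the SCI value and the bookkeeping are simplified stand-ins, not the kernel's code.

#include <stdio.h>

#define MAX_GSI_NUM             4096
#define IRQ_COMPRESSION_START   64

static int sci_irq = 9;                     /* stand-in for acpi_gbl_FADT.sci_interrupt */
static int pci_irq = IRQ_COMPRESSION_START; /* next compressed IRQ to hand out */
static int gsi_to_irq[MAX_GSI_NUM];

/* Returns the (possibly compressed) IRQ for a level-triggered GSI. */
static int compress_gsi(int gsi)
{
        if (gsi < IRQ_COMPRESSION_START)
                return gsi;                 /* legacy range stays identity-mapped */
        if (gsi >= MAX_GSI_NUM)
                return gsi;                 /* too high; give up as the kernel does */
        if (gsi_to_irq[gsi])
                return gsi_to_irq[gsi];     /* pin already programmed */

        if (pci_irq == sci_irq)             /* never hand out the SCI's IRQ */
                pci_irq++;
        return gsi_to_irq[gsi] = pci_irq++;
}

int main(void)
{
        printf("%d %d %d\n", compress_gsi(80), compress_gsi(200), compress_gsi(80));
        /* -> 64 65 64: two distinct high GSIs get dense IRQs, repeats stay stable */
        return 0;
}
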
1190int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1191 u32 gsi, int triggering, int polarity)
1192{
1193#ifdef CONFIG_X86_MPPARSE
1194 struct mp_config_intsrc mp_irq;
1195 int ioapic;
1196
1197 if (!acpi_ioapic)
1198 return 0;
1199
 1200 /* construct the entry exactly as it would appear in an MP table */
1201 mp_irq.mp_type = MP_INTSRC;
1202 mp_irq.mp_irqtype = mp_INT;
1203 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1204 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1205 mp_irq.mp_srcbus = number;
1206 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1207 ioapic = mp_find_ioapic(gsi);
1208 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
1209 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
1210
1211 save_mp_irq(&mp_irq);
1212#endif
1213 return 0;
1214}
1215
863/* 1216/*
864 * Parse IOAPIC related entries in MADT 1217 * Parse IOAPIC related entries in MADT
865 * returns 0 on success, < 0 on error 1218 * returns 0 on success, < 0 on error
@@ -993,7 +1346,9 @@ static void __init acpi_process_madt(void)
993 acpi_ioapic = 1; 1346 acpi_ioapic = 1;
994 1347
995 smp_found_config = 1; 1348 smp_found_config = 1;
1349#ifdef CONFIG_X86_32
996 setup_apic_routing(); 1350 setup_apic_routing();
1351#endif
997 } 1352 }
998 } 1353 }
999 if (error == -EINVAL) { 1354 if (error == -EINVAL) {
@@ -1009,8 +1364,6 @@ static void __init acpi_process_madt(void)
1009 return; 1364 return;
1010} 1365}
1011 1366
1012#ifdef __i386__
1013
1014static int __init disable_acpi_irq(const struct dmi_system_id *d) 1367static int __init disable_acpi_irq(const struct dmi_system_id *d)
1015{ 1368{
1016 if (!acpi_force) { 1369 if (!acpi_force) {
@@ -1061,6 +1414,24 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1061} 1414}
1062 1415
1063/* 1416/*
1417 * Force ignoring BIOS IRQ0 pin2 override
1418 */
1419static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1420{
1421 /*
1422 * The ati_ixp4x0_rev() early PCI quirk should have set
1423 * the acpi_skip_timer_override flag already:
1424 */
1425 if (!acpi_skip_timer_override) {
1426 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1427 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1428 d->ident);
1429 acpi_skip_timer_override = 1;
1430 }
1431 return 0;
1432}
1433
1434/*
1064 * If your system is blacklisted here, but you find that acpi=force 1435 * If your system is blacklisted here, but you find that acpi=force
1065 * works for you, please contact acpi-devel@sourceforge.net 1436 * works for you, please contact acpi-devel@sourceforge.net
1066 */ 1437 */
@@ -1227,11 +1598,51 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1227 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), 1598 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1228 }, 1599 },
1229 }, 1600 },
1601 /*
1602 * HP laptops which use a DSDT reporting as HP/SB400/10000,
1603 * which includes some code which overrides all temperature
1604 * trip points to 16C if the INTIN2 input of the I/O APIC
1605 * is enabled. This input is incorrectly designated the
1606 * ISA IRQ 0 via an interrupt source override even though
1607 * it is wired to the output of the master 8259A and INTIN0
1608 * is not connected at all. Force ignoring BIOS IRQ0 pin2
 1609 * override in such cases.
1610 */
1611 {
1612 .callback = dmi_ignore_irq0_timer_override,
1613 .ident = "HP nx6115 laptop",
1614 .matches = {
1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1616 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1617 },
1618 },
1619 {
1620 .callback = dmi_ignore_irq0_timer_override,
1621 .ident = "HP NX6125 laptop",
1622 .matches = {
1623 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1624 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
1625 },
1626 },
1627 {
1628 .callback = dmi_ignore_irq0_timer_override,
1629 .ident = "HP NX6325 laptop",
1630 .matches = {
1631 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1633 },
1634 },
1635 {
1636 .callback = dmi_ignore_irq0_timer_override,
1637 .ident = "HP 6715b laptop",
1638 .matches = {
1639 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1640 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1641 },
1642 },
1230 {} 1643 {}
1231}; 1644};
1232 1645
1233#endif /* __i386__ */
1234
1235/* 1646/*
1236 * acpi_boot_table_init() and acpi_boot_init() 1647 * acpi_boot_table_init() and acpi_boot_init()
1237 * called from setup_arch(), always. 1648 * called from setup_arch(), always.
@@ -1259,9 +1670,7 @@ int __init acpi_boot_table_init(void)
1259{ 1670{
1260 int error; 1671 int error;
1261 1672
1262#ifdef __i386__
1263 dmi_check_system(acpi_dmi_table); 1673 dmi_check_system(acpi_dmi_table);
1264#endif
1265 1674
1266 /* 1675 /*
1267 * If acpi_disabled, bail out 1676 * If acpi_disabled, bail out
@@ -1386,6 +1795,20 @@ static int __init parse_pci(char *arg)
1386} 1795}
1387early_param("pci", parse_pci); 1796early_param("pci", parse_pci);
1388 1797
1798int __init acpi_mps_check(void)
1799{
1800#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
 1801/* mptable code is not built-in */
1802 if (acpi_disabled || acpi_noirq) {
1803 printk(KERN_WARNING "MPS support code is not built-in.\n"
1804 "Using acpi=off or acpi=noirq or pci=noacpi "
1805 "may have problem\n");
1806 return 1;
1807 }
1808#endif
1809 return 0;
1810}
1811
1389#ifdef CONFIG_X86_IO_APIC 1812#ifdef CONFIG_X86_IO_APIC
1390static int __init parse_acpi_skip_timer_override(char *arg) 1813static int __init parse_acpi_skip_timer_override(char *arg)
1391{ 1814{
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index de2d2e4ebad9..7c074eec39fb 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
56 if (cpu_has(c, X86_FEATURE_ACPI)) 56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH; 57 buf[2] |= ACPI_PDC_T_FFH;
58 58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
59 obj->type = ACPI_TYPE_BUFFER; 65 obj->type = ACPI_TYPE_BUFFER;
60 obj->buffer.length = 12; 66 obj->buffer.length = 12;
61 obj->buffer.pointer = (u8 *) buf; 67 obj->buffer.pointer = (u8 *) buf;
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index f9b77fb37e5b..3355973b12ac 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -5,6 +5,7 @@
5#include <asm/msr-index.h> 5#include <asm/msr-index.h>
6#include <asm/page.h> 6#include <asm/page.h>
7#include <asm/pgtable.h> 7#include <asm/pgtable.h>
8#include <asm/processor-flags.h>
8 9
9 .code16 10 .code16
10 .section ".header", "a" 11 .section ".header", "a"
@@ -24,6 +25,11 @@ pmode_gdt: .quad 0
24realmode_flags: .long 0 25realmode_flags: .long 0
25real_magic: .long 0 26real_magic: .long 0
26trampoline_segment: .word 0 27trampoline_segment: .word 0
28_pad1: .byte 0
29wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0
27signature: .long 0x51ee1111 33signature: .long 0x51ee1111
28 34
29 .text 35 .text
@@ -34,11 +40,34 @@ _start:
34 cli 40 cli
35 cld 41 cld
36 42
43 /* Apparently some dimwit BIOS programmers don't know how to
44 program a PM to RM transition, and we might end up here with
45 junk in the data segment descriptor registers. The only way
46 to repair that is to go into PM and fix it ourselves... */
47 movw $16, %cx
48 lgdtl %cs:wakeup_gdt
49 movl %cr0, %eax
50 orb $X86_CR0_PE, %al
51 movl %eax, %cr0
52 jmp 1f
531: ljmpw $8, $2f
542:
55 movw %cx, %ds
56 movw %cx, %es
57 movw %cx, %ss
58 movw %cx, %fs
59 movw %cx, %gs
60
61 andb $~X86_CR0_PE, %al
62 movl %eax, %cr0
63 jmp wakeup_jmp
643:
37 /* Set up segments */ 65 /* Set up segments */
38 movw %cs, %ax 66 movw %cs, %ax
39 movw %ax, %ds 67 movw %ax, %ds
40 movw %ax, %es 68 movw %ax, %es
41 movw %ax, %ss 69 movw %ax, %ss
70 lidtl wakeup_idt
42 71
43 movl $wakeup_stack_end, %esp 72 movl $wakeup_stack_end, %esp
44 73
@@ -98,7 +127,14 @@ bogus_real_magic:
98 jmp 1b 127 jmp 1b
99 128
100 .data 129 .data
101 .balign 4 130 .balign 8
131
132 /* This is the standard real-mode IDT */
133wakeup_idt:
134 .word 0xffff /* limit */
135 .long 0 /* address */
136 .word 0
137
102 .globl HEAP, heap_end 138 .globl HEAP, heap_end
103HEAP: 139HEAP:
104 .long wakeup_heap 140 .long wakeup_heap
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index ef8166fe8020..69d38d0b2b64 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -24,6 +24,11 @@ struct wakeup_header {
24 u32 realmode_flags; 24 u32 realmode_flags;
25 u32 real_magic; 25 u32 real_magic;
26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ 26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
27 u8 _pad1;
28 u8 wakeup_jmp;
29 u16 wakeup_jmp_off;
30 u16 wakeup_jmp_seg;
31 u64 wakeup_gdt[3];
27 u32 signature; /* To check we have correct structure */ 32 u32 signature; /* To check we have correct structure */
28} __attribute__((__packed__)); 33} __attribute__((__packed__));
29 34
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964b..426e5d91b63a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -19,7 +20,7 @@ unsigned long acpi_realmode_flags;
19/* address in low memory of the wakeup routine. */ 20/* address in low memory of the wakeup routine. */
20static unsigned long acpi_realmode; 21static unsigned long acpi_realmode;
21 22
22#ifdef CONFIG_64BIT 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
23static char temp_stack[10240]; 24static char temp_stack[10240];
24#endif 25#endif
25 26
@@ -50,6 +51,29 @@ int acpi_save_state_mem(void)
50 51
51 header->video_mode = saved_video_mode; 52 header->video_mode = saved_video_mode;
52 53
54 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
55
56 /*
57 * Set up the wakeup GDT. We set these up as Big Real Mode,
58 * that is, with limits set to 4 GB. At least the Lenovo
59 * Thinkpad X61 is known to need this for the video BIOS
60 * initialization quirk to work; this is likely to also
61 * be the case for other laptops or integrated video devices.
62 */
63
64 /* GDT[0]: GDT self-pointer */
65 header->wakeup_gdt[0] =
66 (u64)(sizeof(header->wakeup_gdt) - 1) +
67 ((u64)(acpi_wakeup_address +
68 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
69 << 16);
70 /* GDT[1]: big real mode-like code segment */
71 header->wakeup_gdt[1] =
72 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
73 /* GDT[2]: big real mode-like data segment */
74 header->wakeup_gdt[2] =
75 GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
76
53#ifndef CONFIG_64BIT 77#ifndef CONFIG_64BIT
54 store_gdt((struct desc_ptr *)&header->pmode_gdt); 78 store_gdt((struct desc_ptr *)&header->pmode_gdt);
55 79
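
The wakeup GDT set up above uses GDT_ENTRY(flags, base, limit) to build "big real mode" code (0x809b) and data (0x8093) descriptors with a page-granular 0xfffff limit (4 GiB), while GDT[0] doubles as the lgdt operand consumed by wakeup.S: a 16-bit limit in the low word and the 32-bit linear address of the table starting at bit 16. The sketch below re-derives the architecture-defined descriptor packing in userspace so the bit layout is visible; it is an illustration, not a copy of the kernel macro, and the base address used is hypothetical.

#include <stdio.h>
#include <stdint.h>

/* Pack an x86 segment descriptor the way GDT_ENTRY(flags, base, limit) does:
 * limit[15:0], base[23:0], flags (type/S/DPL/P plus AVL/L/DB/G), limit[19:16],
 * base[31:24]. */
static uint64_t pack_descriptor(uint32_t flags, uint32_t base, uint32_t limit)
{
        return  ((uint64_t)(base  & 0xff000000) << 32) |
                ((uint64_t)(flags & 0x0000f0ff) << 40) |
                ((uint64_t)(limit & 0x000f0000) << 32) |
                ((uint64_t)(base  & 0x00ffffff) << 16) |
                ((uint64_t)(limit & 0x0000ffff));
}

int main(void)
{
        uint32_t wakeup_base = 0x9d000; /* hypothetical acpi_wakeup_address */

        /* big real mode code (0x809b) and data (0x8093) segments, 4 GiB limit */
        printf("code: %016llx\n",
               (unsigned long long)pack_descriptor(0x809b, wakeup_base, 0xfffff));
        printf("data: %016llx\n",
               (unsigned long long)pack_descriptor(0x8093, wakeup_base, 0xfffff));
        return 0;
}
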
@@ -62,7 +86,7 @@ int acpi_save_state_mem(void)
62#endif /* !CONFIG_64BIT */ 86#endif /* !CONFIG_64BIT */
63 87
64 header->pmode_cr0 = read_cr0(); 88 header->pmode_cr0 = read_cr0();
65 header->pmode_cr4 = read_cr4(); 89 header->pmode_cr4 = read_cr4_safe();
66 header->realmode_flags = acpi_realmode_flags; 90 header->realmode_flags = acpi_realmode_flags;
67 header->real_magic = 0x12345678; 91 header->real_magic = 0x12345678;
68 92
@@ -72,7 +96,9 @@ int acpi_save_state_mem(void)
72 saved_magic = 0x12345678; 96 saved_magic = 0x12345678;
73#else /* CONFIG_64BIT */ 97#else /* CONFIG_64BIT */
74 header->trampoline_segment = setup_trampoline() >> 4; 98 header->trampoline_segment = setup_trampoline() >> 4;
75 init_rsp = (unsigned long)temp_stack + 4096; 99#ifdef CONFIG_SMP
100 stack_start.sp = temp_stack + 4096;
101#endif
76 initial_code = (unsigned long)wakeup_long64; 102 initial_code = (unsigned long)wakeup_long64;
77 saved_magic = 0x123456789abcdef0; 103 saved_magic = 0x123456789abcdef0;
78#endif /* CONFIG_64BIT */ 104#endif /* CONFIG_64BIT */
@@ -111,7 +137,7 @@ void __init acpi_reserve_bootmem(void)
111 return; 137 return;
112 } 138 }
113 139
114 acpi_wakeup_address = acpi_realmode; 140 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
115} 141}
116 142
117 143
@@ -124,6 +150,12 @@ static int __init acpi_sleep_setup(char *str)
124 acpi_realmode_flags |= 2; 150 acpi_realmode_flags |= 2;
125 if (strncmp(str, "s3_beep", 7) == 0) 151 if (strncmp(str, "s3_beep", 7) == 0)
126 acpi_realmode_flags |= 4; 152 acpi_realmode_flags |= 4;
153#ifdef CONFIG_HIBERNATION
154 if (strncmp(str, "s4_nohwsig", 10) == 0)
155 acpi_no_s4_hw_signature();
156#endif
157 if (strncmp(str, "old_ordering", 12) == 0)
158 acpi_old_suspend_ordering();
127 str = strchr(str, ','); 159 str = strchr(str, ',');
128 if (str != NULL) 160 if (str != NULL)
129 str += strspn(str, ", \t"); 161 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/spinlock.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
@@ -143,37 +143,27 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
143#ifdef CONFIG_X86_64 143#ifdef CONFIG_X86_64
144 144
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146static inline const unsigned char*const * find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_has(X86_FEATURE_NOPL))
150 return p6_nops;
151 else
152 return k8_nops;
150} 153}
151 154
152#else /* CONFIG_X86_64 */ 155#else /* CONFIG_X86_64 */
153 156
154static const struct nop { 157const unsigned char *const *find_nop_table(void)
155 int cpuid;
156 const unsigned char *const *noptable;
157} noptypes[] = {
158 { X86_FEATURE_K8, k8_nops },
159 { X86_FEATURE_K7, k7_nops },
160 { X86_FEATURE_P4, p6_nops },
161 { X86_FEATURE_P3, p6_nops },
162 { -1, NULL }
163};
164
165static const unsigned char*const * find_nop_table(void)
166{ 158{
167 const unsigned char *const *noptable = intel_nops; 159 if (boot_cpu_has(X86_FEATURE_K8))
168 int i; 160 return k8_nops;
169 161 else if (boot_cpu_has(X86_FEATURE_K7))
170 for (i = 0; noptypes[i].cpuid >= 0; i++) { 162 return k7_nops;
171 if (boot_cpu_has(noptypes[i].cpuid)) { 163 else if (boot_cpu_has(X86_FEATURE_NOPL))
172 noptable = noptypes[i].noptable; 164 return p6_nops;
173 break; 165 else
174 } 166 return intel_nops;
175 }
176 return noptable;
177} 167}
178 168
179#endif /* CONFIG_X86_64 */ 169#endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
241 continue; 231 continue;
242 if (*ptr > text_end) 232 if (*ptr > text_end)
243 continue; 233 continue;
244 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ 234 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
245 }; 236 };
246} 237}
247 238
248static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
249{ 240{
250 u8 **ptr; 241 u8 **ptr;
251 char insn[1];
252 242
253 if (noreplace_smp) 243 if (noreplace_smp)
254 return; 244 return;
255 245
256 add_nops(insn, 1);
257 for (ptr = start; ptr < end; ptr++) { 246 for (ptr = start; ptr < end; ptr++) {
258 if (*ptr < text) 247 if (*ptr < text)
259 continue; 248 continue;
260 if (*ptr > text_end) 249 if (*ptr > text_end)
261 continue; 250 continue;
262 text_poke(*ptr, insn, 1); 251 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
263 }; 253 };
264} 254}
265 255
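
alternatives_smp_lock()/alternatives_smp_unlock() above patch exactly one byte at each recorded site: 0xf0 is the x86 LOCK prefix and 0x3e is the DS segment-override prefix, which is architecturally harmless in front of these memory operands, so flipping between the two switches locked instructions to cheap unlocked forms on UP and back again when another CPU comes online. A tiny sketch of the same single-byte toggle over an in-memory byte array; text_poke() is what performs this safely on live kernel text, and the offsets and helper below are invented.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define LOCK_PREFIX_BYTE 0xf0   /* x86 LOCK prefix */
#define DS_PREFIX_BYTE   0x3e   /* DS override: harmless filler byte */

/* Toggle every recorded prefix site between the SMP-safe and UP form. */
static void patch_sites(uint8_t *text, const size_t *sites, size_t n, int smp)
{
        for (size_t i = 0; i < n; i++)
                text[sites[i]] = smp ? LOCK_PREFIX_BYTE : DS_PREFIX_BYTE;
}

int main(void)
{
        /* "lock incl (%rdi)" encodes as f0 ff 07; pretend it sits at offset 2. */
        uint8_t text[] = { 0x90, 0x90, 0xf0, 0xff, 0x07 };
        size_t sites[] = { 2 };

        patch_sites(text, sites, 1, 0); /* go UP: 3e ff 07 */
        printf("UP:  %02x %02x %02x\n", text[2], text[3], text[4]);
        patch_sites(text, sites, 1, 1); /* back to SMP: f0 ff 07 */
        printf("SMP: %02x %02x %02x\n", text[2], text[3], text[4]);
        return 0;
}
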
@@ -279,7 +269,7 @@ struct smp_alt_module {
279 struct list_head next; 269 struct list_head next;
280}; 270};
281static LIST_HEAD(smp_alt_modules); 271static LIST_HEAD(smp_alt_modules);
282static DEFINE_SPINLOCK(smp_alt); 272static DEFINE_MUTEX(smp_alt);
283static int smp_mode = 1; /* protected by smp_alt */ 273static int smp_mode = 1; /* protected by smp_alt */
284 274
285void alternatives_smp_module_add(struct module *mod, char *name, 275void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +302,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
312 __func__, smp->locks, smp->locks_end, 302 __func__, smp->locks, smp->locks_end,
313 smp->text, smp->text_end, smp->name); 303 smp->text, smp->text_end, smp->name);
314 304
315 spin_lock(&smp_alt); 305 mutex_lock(&smp_alt);
316 list_add_tail(&smp->next, &smp_alt_modules); 306 list_add_tail(&smp->next, &smp_alt_modules);
317 if (boot_cpu_has(X86_FEATURE_UP)) 307 if (boot_cpu_has(X86_FEATURE_UP))
318 alternatives_smp_unlock(smp->locks, smp->locks_end, 308 alternatives_smp_unlock(smp->locks, smp->locks_end,
319 smp->text, smp->text_end); 309 smp->text, smp->text_end);
320 spin_unlock(&smp_alt); 310 mutex_unlock(&smp_alt);
321} 311}
322 312
323void alternatives_smp_module_del(struct module *mod) 313void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +317,17 @@ void alternatives_smp_module_del(struct module *mod)
327 if (smp_alt_once || noreplace_smp) 317 if (smp_alt_once || noreplace_smp)
328 return; 318 return;
329 319
330 spin_lock(&smp_alt); 320 mutex_lock(&smp_alt);
331 list_for_each_entry(item, &smp_alt_modules, next) { 321 list_for_each_entry(item, &smp_alt_modules, next) {
332 if (mod != item->mod) 322 if (mod != item->mod)
333 continue; 323 continue;
334 list_del(&item->next); 324 list_del(&item->next);
335 spin_unlock(&smp_alt); 325 mutex_unlock(&smp_alt);
336 DPRINTK("%s: %s\n", __func__, item->name); 326 DPRINTK("%s: %s\n", __func__, item->name);
337 kfree(item); 327 kfree(item);
338 return; 328 return;
339 } 329 }
340 spin_unlock(&smp_alt); 330 mutex_unlock(&smp_alt);
341} 331}
342 332
343void alternatives_smp_switch(int smp) 333void alternatives_smp_switch(int smp)
@@ -359,7 +349,7 @@ void alternatives_smp_switch(int smp)
359 return; 349 return;
360 BUG_ON(!smp && (num_online_cpus() > 1)); 350 BUG_ON(!smp && (num_online_cpus() > 1));
361 351
362 spin_lock(&smp_alt); 352 mutex_lock(&smp_alt);
363 353
364 /* 354 /*
365 * Avoid unnecessary switches because it forces JIT based VMs to 355 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +373,7 @@ void alternatives_smp_switch(int smp)
383 mod->text, mod->text_end); 373 mod->text, mod->text_end);
384 } 374 }
385 smp_mode = smp; 375 smp_mode = smp;
386 spin_unlock(&smp_alt); 376 mutex_unlock(&smp_alt);
387} 377}
388 378
389#endif 379#endif
@@ -454,7 +444,7 @@ void __init alternative_instructions(void)
454 _text, _etext); 444 _text, _etext);
455 445
456 /* Only switch to UP mode if we don't immediately boot others */ 446 /* Only switch to UP mode if we don't immediately boot others */
457 if (num_possible_cpus() == 1 || setup_max_cpus <= 1) 447 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
458 alternatives_smp_switch(0); 448 alternatives_smp_switch(0);
459 } 449 }
460#endif 450#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 000000000000..34e4d112b1ef
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,1383 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/gfp.h>
22#include <linux/bitops.h>
23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h>
25#include <asm/proto.h>
26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h>
29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31
32#define EXIT_LOOP_COUNT 10000000
33
34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
40/*
 41 * general struct to manage commands sent to an IOMMU
42 */
43struct iommu_cmd {
44 u32 data[4];
45};
46
47static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
48 struct unity_map_entry *e);
49
50/* returns !0 if the IOMMU is caching non-present entries in its TLB */
51static int iommu_has_npcache(struct amd_iommu *iommu)
52{
53 return iommu->cap & IOMMU_CAP_NPCACHE;
54}
55
56/****************************************************************************
57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
154 * IOMMU command queuing functions
155 *
156 ****************************************************************************/
157
158/*
 159 * Writes the command to the IOMMU's command buffer and informs the
160 * hardware about the new command. Must be called with iommu->lock held.
161 */
162static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
163{
164 u32 tail, head;
165 u8 *target;
166
167 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
168 target = iommu->cmd_buf + tail;
169 memcpy_toio(target, cmd, sizeof(*cmd));
170 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
171 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
172 if (tail == head)
173 return -ENOMEM;
174 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
175
176 return 0;
177}
178
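
__iommu_queue_command() above treats the command buffer as a ring: the command is copied at the current tail, the tail is advanced modulo the buffer size, and the new tail is only written to the hardware tail register if it has not caught up with the head (otherwise the ring is full and -ENOMEM is returned). A minimal software-only model of that producer side, with made-up sizes and a plain array standing in for the MMIO-backed buffer.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define CMD_SIZE        16                      /* four u32s per command */
#define CMD_BUF_SIZE    (4 * CMD_SIZE)          /* tiny 4-slot ring for illustration */

struct cmd_ring {
        uint8_t  buf[CMD_BUF_SIZE];
        uint32_t head;          /* consumer position (hardware-owned in reality) */
        uint32_t tail;          /* producer position */
};

/* Model of the queueing logic: returns 0 on success, -1 if the ring is full. */
static int queue_command(struct cmd_ring *r, const uint32_t cmd[4])
{
        uint32_t next = (r->tail + CMD_SIZE) % CMD_BUF_SIZE;

        if (next == r->head)
                return -1;      /* advancing the tail would overrun the head */
        memcpy(r->buf + r->tail, cmd, CMD_SIZE);
        r->tail = next;         /* stands in for "writel(tail, ... MMIO_CMD_TAIL_OFFSET)" */
        return 0;
}

int main(void)
{
        struct cmd_ring r = { .head = 0, .tail = 0 };
        uint32_t cmd[4] = { 0, 0, 0, 0 };

        for (int i = 0; i < 5; i++)     /* one slot stays empty, so the last two fail */
                printf("queue %d -> %d\n", i, queue_command(&r, cmd));
        return 0;
}
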
179/*
180 * General queuing function for commands. Takes iommu->lock and calls
181 * __iommu_queue_command().
182 */
183static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
184{
185 unsigned long flags;
186 int ret;
187
188 spin_lock_irqsave(&iommu->lock, flags);
189 ret = __iommu_queue_command(iommu, cmd);
190 spin_unlock_irqrestore(&iommu->lock, flags);
191
192 return ret;
193}
194
195/*
196 * This function is called whenever we need to ensure that the IOMMU has
197 * completed execution of all commands we sent. It sends a
198 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
199 * us about that by writing a value to a physical address we pass with
200 * the command.
201 */
202static int iommu_completion_wait(struct amd_iommu *iommu)
203{
204 int ret = 0, ready = 0;
205 unsigned status = 0;
206 struct iommu_cmd cmd;
207 unsigned long flags, i = 0;
208
209 memset(&cmd, 0, sizeof(cmd));
210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
212
213 iommu->need_sync = 0;
214
215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
218
219 if (ret)
220 goto out;
221
222 while (!ready && (i < EXIT_LOOP_COUNT)) {
223 ++i;
224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
227 }
228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
232
233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
237
238 return 0;
239}
240
241/*
242 * Command send function for invalidating a device table entry
243 */
244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
245{
246 struct iommu_cmd cmd;
247 int ret;
248
249 BUG_ON(iommu == NULL);
250
251 memset(&cmd, 0, sizeof(cmd));
252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
253 cmd.data[0] = devid;
254
255 ret = iommu_queue_command(iommu, &cmd);
256
257 iommu->need_sync = 1;
258
259 return ret;
260}
261
262/*
 263 * Generic command send function for invalidating TLB entries
264 */
265static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
266 u64 address, u16 domid, int pde, int s)
267{
268 struct iommu_cmd cmd;
269 int ret;
270
271 memset(&cmd, 0, sizeof(cmd));
272 address &= PAGE_MASK;
273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
274 cmd.data[1] |= domid;
275 cmd.data[2] = lower_32_bits(address);
276 cmd.data[3] = upper_32_bits(address);
277 if (s) /* size bit - we flush more than one 4kb page */
278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 279 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
281
282 ret = iommu_queue_command(iommu, &cmd);
283
284 iommu->need_sync = 1;
285
286 return ret;
287}
288
289/*
290 * TLB invalidation function which is called from the mapping functions.
291 * It invalidates a single PTE if the range to flush is within a single
292 * page. Otherwise it flushes the whole TLB of the IOMMU.
293 */
294static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
295 u64 address, size_t size)
296{
297 int s = 0;
298 unsigned pages = iommu_num_pages(address, size);
299
300 address &= PAGE_MASK;
301
302 if (pages > 1) {
303 /*
304 * If we have to flush more than one page, flush all
305 * TLB entries for this domain
306 */
307 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
308 s = 1;
309 }
310
311 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
312
313 return 0;
314}
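
/*
 * Editorial note, worked example (assumes iommu_num_pages() counts the 4kb
 * pages touched by [address, address + size)): a flush request with
 * address = 0x1ff8 and size = 0x10 crosses a page boundary and therefore
 * touches two pages, so the code above falls back to flushing the whole
 * domain TLB (s = 1, CMD_INV_IOMMU_ALL_PAGES_ADDRESS) instead of issuing a
 * single-page invalidation.
 */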
315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
324/****************************************************************************
325 *
326 * The functions below are used to create the page table mappings for
327 * unity mapped regions.
328 *
329 ****************************************************************************/
330
331/*
332 * Generic mapping functions. It maps a physical address into a DMA
333 * address space. It allocates the page table pages if necessary.
334 * In the future it can be extended to a generic mapping function
335 * supporting all features of AMD IOMMU page tables like level skipping
336 * and full 64 bit address spaces.
337 */
338static int iommu_map(struct protection_domain *dom,
339 unsigned long bus_addr,
340 unsigned long phys_addr,
341 int prot)
342{
343 u64 __pte, *pte, *page;
344
345 bus_addr = PAGE_ALIGN(bus_addr);
346	phys_addr = PAGE_ALIGN(phys_addr);
347
348 /* only support 512GB address spaces for now */
349 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
350 return -EINVAL;
351
352 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
353
354 if (!IOMMU_PTE_PRESENT(*pte)) {
355 page = (u64 *)get_zeroed_page(GFP_KERNEL);
356 if (!page)
357 return -ENOMEM;
358 *pte = IOMMU_L2_PDE(virt_to_phys(page));
359 }
360
361 pte = IOMMU_PTE_PAGE(*pte);
362 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
363
364 if (!IOMMU_PTE_PRESENT(*pte)) {
365 page = (u64 *)get_zeroed_page(GFP_KERNEL);
366 if (!page)
367 return -ENOMEM;
368 *pte = IOMMU_L1_PDE(virt_to_phys(page));
369 }
370
371 pte = IOMMU_PTE_PAGE(*pte);
372 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
373
374 if (IOMMU_PTE_PRESENT(*pte))
375 return -EBUSY;
376
377 __pte = phys_addr | IOMMU_PTE_P;
378 if (prot & IOMMU_PROT_IR)
379 __pte |= IOMMU_PTE_IR;
380 if (prot & IOMMU_PROT_IW)
381 __pte |= IOMMU_PTE_IW;
382
383 *pte = __pte;
384
385 return 0;
386}
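
/*
 * Editorial sketch of the address split used above (an assumption based on
 * the 512GB/3-level limit, not taken verbatim from the header): with 4kb
 * pages and 512 eight-byte entries per table level, the
 * IOMMU_PTE_Lx_INDEX() macros would decompose a bus address as
 *
 *	L2 index: (bus_addr >> 30) & 0x1ff
 *	L1 index: (bus_addr >> 21) & 0x1ff
 *	L0 index: (bus_addr >> 12) & 0x1ff
 *
 * 9 + 9 + 9 index bits plus the 12 bit page offset cover 2^39 bytes, which
 * is the 512GB limit checked at the top of iommu_map().
 */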
387
388/*
389 * This function checks if a specific unity mapping entry is needed for
390 * this specific IOMMU.
391 */
392static int iommu_for_unity_map(struct amd_iommu *iommu,
393 struct unity_map_entry *entry)
394{
395 u16 bdf, i;
396
397 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
398 bdf = amd_iommu_alias_table[i];
399 if (amd_iommu_rlookup_table[bdf] == iommu)
400 return 1;
401 }
402
403 return 0;
404}
405
406/*
407 * Init the unity mappings for a specific IOMMU in the system
408 *
409 * Basically iterates over all unity mapping entries and applies them to
410 * the default DMA domain of that IOMMU if necessary.
411 */
412static int iommu_init_unity_mappings(struct amd_iommu *iommu)
413{
414 struct unity_map_entry *entry;
415 int ret;
416
417 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
418 if (!iommu_for_unity_map(iommu, entry))
419 continue;
420 ret = dma_ops_unity_map(iommu->default_dom, entry);
421 if (ret)
422 return ret;
423 }
424
425 return 0;
426}
427
428/*
429 * This function actually applies the mapping to the page table of the
430 * dma_ops domain.
431 */
432static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
433 struct unity_map_entry *e)
434{
435 u64 addr;
436 int ret;
437
438 for (addr = e->address_start; addr < e->address_end;
439 addr += PAGE_SIZE) {
440 ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
441 if (ret)
442 return ret;
443 /*
444 * if unity mapping is in aperture range mark the page
445 * as allocated in the aperture
446 */
447 if (addr < dma_dom->aperture_size)
448 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
449 }
450
451 return 0;
452}
453
454/*
455 * Inits the unity mappings required for a specific device
456 */
457static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
458 u16 devid)
459{
460 struct unity_map_entry *e;
461 int ret;
462
463 list_for_each_entry(e, &amd_iommu_unity_map, list) {
464 if (!(devid >= e->devid_start && devid <= e->devid_end))
465 continue;
466 ret = dma_ops_unity_map(dma_dom, e);
467 if (ret)
468 return ret;
469 }
470
471 return 0;
472}
473
474/****************************************************************************
475 *
476 * The next functions belong to the address allocator for the dma_ops
477 * interface functions. They work like the allocators in the other IOMMU
478 * drivers. It's basically a bitmap which marks the allocated pages in
479 * the aperture. Maybe it could be enhanced in the future to a more
480 * efficient allocator.
481 *
482 ****************************************************************************/
483
484/*
485 * The address allocator core function.
486 *
487 * called with domain->lock held
488 */
489static unsigned long dma_ops_alloc_addresses(struct device *dev,
490 struct dma_ops_domain *dom,
491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
494{
495 unsigned long limit;
496 unsigned long address;
497 unsigned long boundary_size;
498
499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
500 PAGE_SIZE) >> PAGE_SHIFT;
501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
503
504 if (dom->next_bit >= limit) {
505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
508
509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
510				    0, boundary_size, align_mask);
511 if (address == -1) {
512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
516
517 if (likely(address != -1)) {
518 dom->next_bit = address + pages;
519 address <<= PAGE_SHIFT;
520 } else
521 address = bad_dma_address;
522
523 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
524
525 return address;
526}
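
/*
 * Editorial worked example (hypothetical values): for a 64MB aperture the
 * bitmap above covers 64MB / 4kb = 16384 page slots.  A request for
 * pages = 8 with align_mask = 7 (what an aligned 32kb allocation passes
 * down) returns the first free run of 8 slots starting at a multiple of 8,
 * shifted left by PAGE_SHIFT to turn the bit index into a DMA address.
 */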
527
528/*
529 * The address free function.
530 *
531 * called with domain->lock held
532 */
533static void dma_ops_free_addresses(struct dma_ops_domain *dom,
534 unsigned long address,
535 unsigned int pages)
536{
537 address >>= PAGE_SHIFT;
538 iommu_area_free(dom->bitmap, address, pages);
539}
540
541/****************************************************************************
542 *
543 * The next functions belong to the domain allocation. A domain is
544 * allocated for every IOMMU as the default domain. If device isolation
545 * is enabled, every device gets its own domain. The most important thing
546 * about domains is the page table mapping the DMA address space they
547 * contain.
548 *
549 ****************************************************************************/
550
551static u16 domain_id_alloc(void)
552{
553 unsigned long flags;
554 int id;
555
556 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
557 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
558 BUG_ON(id == 0);
559 if (id > 0 && id < MAX_DOMAIN_ID)
560 __set_bit(id, amd_iommu_pd_alloc_bitmap);
561 else
562 id = 0;
563 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
564
565 return id;
566}
567
568/*
569 * Used to reserve address ranges in the aperture (e.g. for exclusion
570 * ranges).
571 */
572static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
573 unsigned long start_page,
574 unsigned int pages)
575{
576 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
577
578 if (start_page + pages > last_page)
579 pages = last_page - start_page;
580
581 iommu_area_reserve(dom->bitmap, start_page, pages);
582}
583
584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
585{
586 int i, j;
587 u64 *p1, *p2, *p3;
588
589 p1 = dma_dom->domain.pt_root;
590
591 if (!p1)
592 return;
593
594 for (i = 0; i < 512; ++i) {
595 if (!IOMMU_PTE_PRESENT(p1[i]))
596 continue;
597
598 p2 = IOMMU_PTE_PAGE(p1[i]);
599		for (j = 0; j < 512; ++j) {
600 if (!IOMMU_PTE_PRESENT(p2[j]))
601 continue;
602 p3 = IOMMU_PTE_PAGE(p2[j]);
603 free_page((unsigned long)p3);
604 }
605
606 free_page((unsigned long)p2);
607 }
608
609 free_page((unsigned long)p1);
610}
611
612/*
613 * Free a domain, only used if something went wrong in the
614 * allocation path and we need to free an already allocated page table
615 */
616static void dma_ops_domain_free(struct dma_ops_domain *dom)
617{
618 if (!dom)
619 return;
620
621 dma_ops_free_pagetable(dom);
622
623 kfree(dom->pte_pages);
624
625 kfree(dom->bitmap);
626
627 kfree(dom);
628}
629
630/*
631 * Allocates a new protection domain usable for the dma_ops functions.
632 * It also initializes the page table and the address allocator data
633 * structures required for the dma_ops interface
634 */
635static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
636 unsigned order)
637{
638 struct dma_ops_domain *dma_dom;
639 unsigned i, num_pte_pages;
640 u64 *l2_pde;
641 u64 address;
642
643 /*
644 * Currently the DMA aperture must be between 32 MB and 1GB in size
645 */
646 if ((order < 25) || (order > 30))
647 return NULL;
648
649 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
650 if (!dma_dom)
651 return NULL;
652
653 spin_lock_init(&dma_dom->domain.lock);
654
655 dma_dom->domain.id = domain_id_alloc();
656 if (dma_dom->domain.id == 0)
657 goto free_dma_dom;
658 dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
659 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
660 dma_dom->domain.priv = dma_dom;
661 if (!dma_dom->domain.pt_root)
662 goto free_dma_dom;
663 dma_dom->aperture_size = (1ULL << order);
664 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
665 GFP_KERNEL);
666 if (!dma_dom->bitmap)
667 goto free_dma_dom;
668 /*
669 * mark the first page as allocated so we never return 0 as
670 * a valid dma-address. So we can use 0 as error value
671 */
672 dma_dom->bitmap[0] = 1;
673 dma_dom->next_bit = 0;
674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
678	/* Initialize the exclusion range if necessary */
679 if (iommu->exclusion_start &&
680 iommu->exclusion_start < dma_dom->aperture_size) {
681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
682 int pages = iommu_num_pages(iommu->exclusion_start,
683 iommu->exclusion_length);
684 dma_ops_reserve_addresses(dma_dom, startpage, pages);
685 }
686
687 /*
688 * At the last step, build the page tables so we don't need to
689 * allocate page table pages in the dma_ops mapping/unmapping
690 * path.
691 */
692 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
693 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
694 GFP_KERNEL);
695 if (!dma_dom->pte_pages)
696 goto free_dma_dom;
697
698 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
699 if (l2_pde == NULL)
700 goto free_dma_dom;
701
702 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
703
704 for (i = 0; i < num_pte_pages; ++i) {
705 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
706 if (!dma_dom->pte_pages[i])
707 goto free_dma_dom;
708 address = virt_to_phys(dma_dom->pte_pages[i]);
709 l2_pde[i] = IOMMU_L1_PDE(address);
710 }
711
712 return dma_dom;
713
714free_dma_dom:
715 dma_ops_domain_free(dma_dom);
716
717 return NULL;
718}
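
/*
 * Editorial sizing example for dma_ops_domain_alloc(), using the default
 * aperture order of 26 set in amd_iommu_init.c (i.e. a 64MB aperture):
 *
 *	aperture_size = 1 << 26			=  64MB
 *	bitmap size   = 2^26 / (4kb * 8)	=   2kb
 *	num_pte_pages = 2^26 / (4kb * 512)	=    32
 *
 * so the allocator needs one bitmap bit per 4kb page and one leaf page
 * table page is pre-allocated per 2MB of aperture.
 */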
719
720/*
721 * Find out the protection domain structure for a given PCI device. This
722 * will give us the pointer to the page table root for example.
723 */
724static struct protection_domain *domain_for_device(u16 devid)
725{
726 struct protection_domain *dom;
727 unsigned long flags;
728
729 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
730 dom = amd_iommu_pd_table[devid];
731 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
732
733 return dom;
734}
735
736/*
737 * If a device is not yet associated with a domain, this function
738 * assigns it to one and makes that assignment visible to the hardware
739 */
740static void set_device_domain(struct amd_iommu *iommu,
741 struct protection_domain *domain,
742 u16 devid)
743{
744 unsigned long flags;
745
746 u64 pte_root = virt_to_phys(domain->pt_root);
747
748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
751
752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
755 amd_iommu_dev_table[devid].data[2] = domain->id;
756
757 amd_iommu_pd_table[devid] = domain;
758 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
759
760 iommu_queue_inv_dev_entry(iommu, devid);
761
762 iommu->need_sync = 1;
763}
764
765/*****************************************************************************
766 *
767 * The next functions belong to the dma_ops mapping/unmapping code.
768 *
769 *****************************************************************************/
770
771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
811 * In the dma_ops path we only have the struct device. This function
812 * finds the corresponding IOMMU, the protection domain and the
813 * requestor id for a given device.
814 * If the device is not yet associated with a domain this is also done
815 * in this function.
816 */
817static int get_device_resources(struct device *dev,
818 struct amd_iommu **iommu,
819 struct protection_domain **domain,
820 u16 *bdf)
821{
822 struct dma_ops_domain *dma_dom;
823 struct pci_dev *pcidev;
824 u16 _bdf;
825
826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
832
833 pcidev = to_pci_dev(dev);
834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
835
836 /* device not translated by any IOMMU in the system? */
837 if (_bdf > amd_iommu_last_bdf)
838 return 0;
839
840 *bdf = amd_iommu_alias_table[_bdf];
841
842 *iommu = amd_iommu_rlookup_table[*bdf];
843 if (*iommu == NULL)
844 return 0;
845 *domain = domain_for_device(*bdf);
846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
850 *domain = &dma_dom->domain;
851 set_device_domain(*iommu, *domain, *bdf);
852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
853 "device ", (*domain)->id);
854 print_devid(_bdf, 1);
855 }
856
857 return 1;
858}
859
860/*
861 * This is the generic map function. It maps one 4kb page at paddr to
862 * the given address in the DMA address space for the domain.
863 */
864static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
865 struct dma_ops_domain *dom,
866 unsigned long address,
867 phys_addr_t paddr,
868 int direction)
869{
870 u64 *pte, __pte;
871
872 WARN_ON(address > dom->aperture_size);
873
874 paddr &= PAGE_MASK;
875
876 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
877 pte += IOMMU_PTE_L0_INDEX(address);
878
879 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
880
881 if (direction == DMA_TO_DEVICE)
882 __pte |= IOMMU_PTE_IR;
883 else if (direction == DMA_FROM_DEVICE)
884 __pte |= IOMMU_PTE_IW;
885 else if (direction == DMA_BIDIRECTIONAL)
886 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
887
888 WARN_ON(*pte);
889
890 *pte = __pte;
891
892 return (dma_addr_t)address;
893}
894
895/*
896 * The generic unmapping function for one page in the DMA address space.
897 */
898static void dma_ops_domain_unmap(struct amd_iommu *iommu,
899 struct dma_ops_domain *dom,
900 unsigned long address)
901{
902 u64 *pte;
903
904 if (address >= dom->aperture_size)
905 return;
906
907 WARN_ON(address & 0xfffULL || address > dom->aperture_size);
908
909 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
910 pte += IOMMU_PTE_L0_INDEX(address);
911
912 WARN_ON(!*pte);
913
914 *pte = 0ULL;
915}
916
917/*
918 * This function contains common code for mapping of a physically
919 * contiguous memory region into DMA address space. It is used by all
920 * mapping functions provided by this IOMMU driver.
921 * Must be called with the domain lock held.
922 */
923static dma_addr_t __map_single(struct device *dev,
924 struct amd_iommu *iommu,
925 struct dma_ops_domain *dma_dom,
926 phys_addr_t paddr,
927 size_t size,
928 int dir,
929 bool align,
930 u64 dma_mask)
931{
932 dma_addr_t offset = paddr & ~PAGE_MASK;
933 dma_addr_t address, start;
934 unsigned int pages;
935 unsigned long align_mask = 0;
936 int i;
937
938 pages = iommu_num_pages(paddr, size);
939 paddr &= PAGE_MASK;
940
941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
946 if (unlikely(address == bad_dma_address))
947 goto out;
948
949 start = address;
950 for (i = 0; i < pages; ++i) {
951 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
952 paddr += PAGE_SIZE;
953 start += PAGE_SIZE;
954 }
955 address += offset;
956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
963out:
964 return address;
965}
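
/*
 * Editorial worked example (hypothetical values): mapping paddr = 0x12345678
 * with size = 0x100 gives offset = 0x678 and pages = 1.  If the allocator
 * hands out the aperture address 0x40000, the single page at 0x12345000 is
 * mapped there and the function returns 0x40678, i.e. the caller keeps the
 * original byte offset within the page.
 */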
966
967/*
968 * Does the reverse of the __map_single function. Must be called with
969 * the domain lock held too
970 */
971static void __unmap_single(struct amd_iommu *iommu,
972 struct dma_ops_domain *dma_dom,
973 dma_addr_t dma_addr,
974 size_t size,
975 int dir)
976{
977 dma_addr_t i, start;
978 unsigned int pages;
979
980 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
981 return;
982
983 pages = iommu_num_pages(dma_addr, size);
984 dma_addr &= PAGE_MASK;
985 start = dma_addr;
986
987 for (i = 0; i < pages; ++i) {
988 dma_ops_domain_unmap(iommu, dma_dom, start);
989 start += PAGE_SIZE;
990 }
991
992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
996}
997
998/*
999 * The exported map_single function for dma_ops.
1000 */
1001static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1002 size_t size, int dir)
1003{
1004 unsigned long flags;
1005 struct amd_iommu *iommu;
1006 struct protection_domain *domain;
1007 u16 devid;
1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
1015
1016 get_device_resources(dev, &iommu, &domain, &devid);
1017
1018 if (iommu == NULL || domain == NULL)
1019 /* device not handled by any AMD IOMMU */
1020 return (dma_addr_t)paddr;
1021
1022 spin_lock_irqsave(&domain->lock, flags);
1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
1025 if (addr == bad_dma_address)
1026 goto out;
1027
1028 if (unlikely(iommu->need_sync))
1029 iommu_completion_wait(iommu);
1030
1031out:
1032 spin_unlock_irqrestore(&domain->lock, flags);
1033
1034 return addr;
1035}
1036
1037/*
1038 * The exported unmap_single function for dma_ops.
1039 */
1040static void unmap_single(struct device *dev, dma_addr_t dma_addr,
1041 size_t size, int dir)
1042{
1043 unsigned long flags;
1044 struct amd_iommu *iommu;
1045 struct protection_domain *domain;
1046 u16 devid;
1047
1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
1050 /* device not handled by any AMD IOMMU */
1051 return;
1052
1053 spin_lock_irqsave(&domain->lock, flags);
1054
1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
1056
1057 if (unlikely(iommu->need_sync))
1058 iommu_completion_wait(iommu);
1059
1060 spin_unlock_irqrestore(&domain->lock, flags);
1061}
1062
1063/*
1064 * This is a special map_sg function which is used when we have to map a
1065 * device that is not handled by any AMD IOMMU in the system.
1066 */
1067static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
1068 int nelems, int dir)
1069{
1070 struct scatterlist *s;
1071 int i;
1072
1073 for_each_sg(sglist, s, nelems, i) {
1074 s->dma_address = (dma_addr_t)sg_phys(s);
1075 s->dma_length = s->length;
1076 }
1077
1078 return nelems;
1079}
1080
1081/*
1082 * The exported map_sg function for dma_ops (handles scatter-gather
1083 * lists).
1084 */
1085static int map_sg(struct device *dev, struct scatterlist *sglist,
1086 int nelems, int dir)
1087{
1088 unsigned long flags;
1089 struct amd_iommu *iommu;
1090 struct protection_domain *domain;
1091 u16 devid;
1092 int i;
1093 struct scatterlist *s;
1094 phys_addr_t paddr;
1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
1102
1103 get_device_resources(dev, &iommu, &domain, &devid);
1104
1105 if (!iommu || !domain)
1106 return map_sg_no_iommu(dev, sglist, nelems, dir);
1107
1108 spin_lock_irqsave(&domain->lock, flags);
1109
1110 for_each_sg(sglist, s, nelems, i) {
1111 paddr = sg_phys(s);
1112
1113 s->dma_address = __map_single(dev, iommu, domain->priv,
1114 paddr, s->length, dir, false,
1115 dma_mask);
1116
1117 if (s->dma_address) {
1118 s->dma_length = s->length;
1119 mapped_elems++;
1120 } else
1121 goto unmap;
1122 }
1123
1124 if (unlikely(iommu->need_sync))
1125 iommu_completion_wait(iommu);
1126
1127out:
1128 spin_unlock_irqrestore(&domain->lock, flags);
1129
1130 return mapped_elems;
1131unmap:
1132 for_each_sg(sglist, s, mapped_elems, i) {
1133 if (s->dma_address)
1134 __unmap_single(iommu, domain->priv, s->dma_address,
1135 s->dma_length, dir);
1136 s->dma_address = s->dma_length = 0;
1137 }
1138
1139 mapped_elems = 0;
1140
1141 goto out;
1142}
1143
1144/*
1145 * The exported unmap_sg function for dma_ops (handles scatter-gather
1146 * lists).
1147 */
1148static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1149 int nelems, int dir)
1150{
1151 unsigned long flags;
1152 struct amd_iommu *iommu;
1153 struct protection_domain *domain;
1154 struct scatterlist *s;
1155 u16 devid;
1156 int i;
1157
1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
1160 return;
1161
1162 spin_lock_irqsave(&domain->lock, flags);
1163
1164 for_each_sg(sglist, s, nelems, i) {
1165 __unmap_single(iommu, domain->priv, s->dma_address,
1166 s->dma_length, dir);
1167 s->dma_address = s->dma_length = 0;
1168 }
1169
1170 if (unlikely(iommu->need_sync))
1171 iommu_completion_wait(iommu);
1172
1173 spin_unlock_irqrestore(&domain->lock, flags);
1174}
1175
1176/*
1177 * The exported alloc_coherent function for dma_ops.
1178 */
1179static void *alloc_coherent(struct device *dev, size_t size,
1180 dma_addr_t *dma_addr, gfp_t flag)
1181{
1182 unsigned long flags;
1183 void *virt_addr;
1184 struct amd_iommu *iommu;
1185 struct protection_domain *domain;
1186 u16 devid;
1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
1195
1196 flag |= __GFP_ZERO;
1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1198 if (!virt_addr)
1199		return NULL;
1200
1201 paddr = virt_to_phys(virt_addr);
1202
1203 if (!iommu || !domain) {
1204 *dma_addr = (dma_addr_t)paddr;
1205 return virt_addr;
1206 }
1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1211 spin_lock_irqsave(&domain->lock, flags);
1212
1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1215
1216 if (*dma_addr == bad_dma_address) {
1217 free_pages((unsigned long)virt_addr, get_order(size));
1218 virt_addr = NULL;
1219 goto out;
1220 }
1221
1222 if (unlikely(iommu->need_sync))
1223 iommu_completion_wait(iommu);
1224
1225out:
1226 spin_unlock_irqrestore(&domain->lock, flags);
1227
1228 return virt_addr;
1229}
1230
1231/*
1232 * The exported free_coherent function for dma_ops.
1233 */
1234static void free_coherent(struct device *dev, size_t size,
1235 void *virt_addr, dma_addr_t dma_addr)
1236{
1237 unsigned long flags;
1238 struct amd_iommu *iommu;
1239 struct protection_domain *domain;
1240 u16 devid;
1241
1242 if (!check_device(dev))
1243 return;
1244
1245 get_device_resources(dev, &iommu, &domain, &devid);
1246
1247 if (!iommu || !domain)
1248 goto free_mem;
1249
1250 spin_lock_irqsave(&domain->lock, flags);
1251
1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1253
1254 if (unlikely(iommu->need_sync))
1255 iommu_completion_wait(iommu);
1256
1257 spin_unlock_irqrestore(&domain->lock, flags);
1258
1259free_mem:
1260 free_pages((unsigned long)virt_addr, get_order(size));
1261}
1262
1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
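
/*
 * Editorial note on the device id used above and below: calc_devid() packs
 * the PCI bus number and devfn into the 16 bit requestor id, exactly as the
 * open-coded variant in prealloc_protection_domains() does:
 *
 *	devid = (bus << 8) | devfn;
 *
 * e.g. bus 0x02, device 3, function 1 (devfn = 0x19) yields devid 0x0219.
 */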
1286
1287/*
1288 * The function for pre-allocating protection domains.
1289 *
1290 * Once the driver core informs the DMA layer when a driver grabs a device
1291 * we won't need to preallocate the protection domains anymore.
1292 * For now we have to.
1293 */
1294void prealloc_protection_domains(void)
1295{
1296 struct pci_dev *dev = NULL;
1297 struct dma_ops_domain *dma_dom;
1298 struct amd_iommu *iommu;
1299 int order = amd_iommu_aperture_order;
1300 u16 devid;
1301
1302 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1303 devid = (dev->bus->number << 8) | dev->devfn;
1304 if (devid > amd_iommu_last_bdf)
1305 continue;
1306 devid = amd_iommu_alias_table[devid];
1307 if (domain_for_device(devid))
1308 continue;
1309 iommu = amd_iommu_rlookup_table[devid];
1310 if (!iommu)
1311 continue;
1312 dma_dom = dma_ops_domain_alloc(iommu, order);
1313 if (!dma_dom)
1314 continue;
1315 init_unity_mappings_for_device(dma_dom, devid);
1316 dma_dom->target_dev = devid;
1317
1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1319 }
1320}
1321
1322static struct dma_mapping_ops amd_iommu_dma_ops = {
1323 .alloc_coherent = alloc_coherent,
1324 .free_coherent = free_coherent,
1325 .map_single = map_single,
1326 .unmap_single = unmap_single,
1327 .map_sg = map_sg,
1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1330};
1331
1332/*
1333 * The function which glues the AMD IOMMU driver into dma_ops.
1334 */
1335int __init amd_iommu_init_dma_ops(void)
1336{
1337 struct amd_iommu *iommu;
1338 int order = amd_iommu_aperture_order;
1339 int ret;
1340
1341 /*
1342 * first allocate a default protection domain for every IOMMU we
1343 * found in the system. Devices not assigned to any other
1344 * protection domain will be assigned to the default one.
1345 */
1346 list_for_each_entry(iommu, &amd_iommu_list, list) {
1347 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
1348 if (iommu->default_dom == NULL)
1349 return -ENOMEM;
1350 ret = iommu_init_unity_mappings(iommu);
1351 if (ret)
1352 goto free_domains;
1353 }
1354
1355 /*
1356 * If device isolation is enabled, pre-allocate the protection
1357 * domains for each device.
1358 */
1359 if (amd_iommu_isolate)
1360 prealloc_protection_domains();
1361
1362 iommu_detected = 1;
1363 force_iommu = 1;
1364 bad_dma_address = 0;
1365#ifdef CONFIG_GART_IOMMU
1366 gart_iommu_aperture_disabled = 1;
1367 gart_iommu_aperture = 0;
1368#endif
1369
1370	/* Finally make this driver's dma_ops visible to the device drivers */
1371 dma_ops = &amd_iommu_dma_ops;
1372
1373 return 0;
1374
1375free_domains:
1376
1377 list_for_each_entry(iommu, &amd_iommu_list, list) {
1378 if (iommu->default_dom)
1379 dma_ops_domain_free(iommu->default_dom);
1380 }
1381
1382 return ret;
1383}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 000000000000..4cd8083c58be
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,1234 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/gfp.h>
23#include <linux/list.h>
24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h>
30#include <asm/iommu.h>
31
32/*
33 * definitions for the ACPI scanning code
34 */
35#define IVRS_HEADER_LENGTH 48
36
37#define ACPI_IVHD_TYPE 0x10
38#define ACPI_IVMD_TYPE_ALL 0x20
39#define ACPI_IVMD_TYPE 0x21
40#define ACPI_IVMD_TYPE_RANGE 0x22
41
42#define IVHD_DEV_ALL 0x01
43#define IVHD_DEV_SELECT 0x02
44#define IVHD_DEV_SELECT_RANGE_START 0x03
45#define IVHD_DEV_RANGE_END 0x04
46#define IVHD_DEV_ALIAS 0x42
47#define IVHD_DEV_ALIAS_RANGE 0x43
48#define IVHD_DEV_EXT_SELECT 0x46
49#define IVHD_DEV_EXT_SELECT_RANGE 0x47
50
51#define IVHD_FLAG_HT_TUN_EN 0x00
52#define IVHD_FLAG_PASSPW_EN 0x01
53#define IVHD_FLAG_RESPASSPW_EN 0x02
54#define IVHD_FLAG_ISOC_EN 0x03
55
56#define IVMD_FLAG_EXCL_RANGE 0x08
57#define IVMD_FLAG_UNITY_MAP 0x01
58
59#define ACPI_DEVFLAG_INITPASS 0x01
60#define ACPI_DEVFLAG_EXTINT 0x02
61#define ACPI_DEVFLAG_NMI 0x04
62#define ACPI_DEVFLAG_SYSMGT1 0x10
63#define ACPI_DEVFLAG_SYSMGT2 0x20
64#define ACPI_DEVFLAG_LINT0 0x40
65#define ACPI_DEVFLAG_LINT1 0x80
66#define ACPI_DEVFLAG_ATSDIS 0x10000000
67
68/*
69 * ACPI table definitions
70 *
71 * These data structures are laid over the table to parse the important values
72 * out of it.
73 */
74
75/*
76 * structure describing one IOMMU in the ACPI table. Typically followed by one
77 * or more ivhd_entrys.
78 */
79struct ivhd_header {
80 u8 type;
81 u8 flags;
82 u16 length;
83 u16 devid;
84 u16 cap_ptr;
85 u64 mmio_phys;
86 u16 pci_seg;
87 u16 info;
88 u32 reserved;
89} __attribute__((packed));
90
91/*
92 * A device entry describing which devices a specific IOMMU translates and
93 * which requestor ids they use.
94 */
95struct ivhd_entry {
96 u8 type;
97 u16 devid;
98 u8 flags;
99 u32 ext;
100} __attribute__((packed));
101
102/*
103 * An AMD IOMMU memory definition structure. It defines things like exclusion
104 * ranges for devices and regions that should be unity mapped.
105 */
106struct ivmd_header {
107 u8 type;
108 u8 flags;
109 u16 length;
110 u16 devid;
111 u16 aux;
112 u64 resv;
113 u64 range_start;
114 u64 range_length;
115} __attribute__((packed));
116
117static int __initdata amd_iommu_detected;
118
119u16 amd_iommu_last_bdf; /* largest PCI device id we have
120 to handle */
121LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
122 we find in ACPI */
123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
126
127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
128 system */
129
130/*
131 * Pointer to the device table which is shared by all AMD IOMMUs
132 * it is indexed by the PCI device id or the HT unit id and contains
133 * information about the domain the device belongs to as well as the
134 * page table root pointer.
135 */
136struct dev_table_entry *amd_iommu_dev_table;
137
138/*
139 * The alias table is a driver specific data structure which contains the
140 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
141 * More than one device can share the same requestor id.
142 */
143u16 *amd_iommu_alias_table;
144
145/*
146 * The rlookup table is used to find the IOMMU which is responsible
147 * for a specific device. It is also indexed by the PCI device id.
148 */
149struct amd_iommu **amd_iommu_rlookup_table;
150
151/*
152 * The pd table (protection domain table) is used to find the protection domain
153 * data structure a device belongs to. Indexed with the PCI device id too.
154 */
155struct protection_domain **amd_iommu_pd_table;
156
157/*
158 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
159 * to know which ones are already in use.
160 */
161unsigned long *amd_iommu_pd_alloc_bitmap;
162
163static u32 dev_table_size; /* size of the device table */
164static u32 alias_table_size; /* size of the alias table */
165static u32 rlookup_table_size; /* size of the rlookup table */
166
167static inline void update_last_devid(u16 devid)
168{
169 if (devid > amd_iommu_last_bdf)
170 amd_iommu_last_bdf = devid;
171}
172
173static inline unsigned long tbl_size(int entry_size)
174{
175 unsigned shift = PAGE_SHIFT +
176 get_order(amd_iommu_last_bdf * entry_size);
177
178 return 1UL << shift;
179}
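
/*
 * Editorial worked example for tbl_size() (an entry size of 32 bytes is
 * assumed for the device table): with amd_iommu_last_bdf = 0xffff the raw
 * table needs 0xffff * 32 = ~2MB, get_order() of that is 9, and the
 * function returns 1UL << (12 + 9) = 2MB.  The result is always a
 * power-of-two multiple of the page size, which is what __get_free_pages()
 * needs later.
 */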
180
181/****************************************************************************
182 *
183 * AMD IOMMU MMIO register space handling functions
184 *
185 * These functions are used to program the IOMMU device registers in
186 * MMIO space required for that driver.
187 *
188 ****************************************************************************/
189
190/*
191 * This function sets the exclusion range in the IOMMU. DMA accesses to the
192 * exclusion range are passed through untranslated
193 */
194static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
195{
196 u64 start = iommu->exclusion_start & PAGE_MASK;
197 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
198 u64 entry;
199
200 if (!iommu->exclusion_start)
201 return;
202
203 entry = start | MMIO_EXCL_ENABLE_MASK;
204 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
205 &entry, sizeof(entry));
206
207 entry = limit;
208 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
209 &entry, sizeof(entry));
210}
211
212/* Programs the physical address of the device table into the IOMMU hardware */
213static void __init iommu_set_device_table(struct amd_iommu *iommu)
214{
215 u32 entry;
216
217 BUG_ON(iommu->mmio_base == NULL);
218
219 entry = virt_to_phys(amd_iommu_dev_table);
220 entry |= (dev_table_size >> 12) - 1;
221 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
222 &entry, sizeof(entry));
223}
224
225/* Generic functions to enable/disable certain features of the IOMMU. */
226static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
227{
228 u32 ctrl;
229
230 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
231 ctrl |= (1 << bit);
232 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
233}
234
235static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
236{
237 u32 ctrl;
238
239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
240 ctrl &= ~(1 << bit);
241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
242}
243
244/* Function to enable the hardware */
245void __init iommu_enable(struct amd_iommu *iommu)
246{
247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
248 "at %02x:%02x.%x cap 0x%hx\n",
249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
253
254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
255}
256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
264/*
265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
266 * the system has one.
267 */
268static u8 * __init iommu_map_mmio_space(u64 address)
269{
270 u8 *ret;
271
272 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
273 return NULL;
274
275 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
276 if (ret != NULL)
277 return ret;
278
279 release_mem_region(address, MMIO_REGION_LENGTH);
280
281 return NULL;
282}
283
284static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
285{
286 if (iommu->mmio_base)
287 iounmap(iommu->mmio_base);
288 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
289}
290
291/****************************************************************************
292 *
293 * The functions below belong to the first pass of AMD IOMMU ACPI table
294 * parsing. In this pass we try to find out the highest device id this
295 * code has to handle. Upon this information the size of the shared data
296 * structures is determined later.
297 *
298 ****************************************************************************/
299
300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
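
/*
 * Editorial note: the upper two bits of an IVHD entry type byte encode the
 * entry length as 0x04 << (type >> 6), i.e. 4, 8, 16 or 32 bytes.  For
 * example IVHD_DEV_SELECT (0x02) has those bits clear and is a 4 byte
 * entry, while IVHD_DEV_ALIAS (0x42) has bit 6 set and is an 8 byte entry,
 * which matches how ivhd_entry_length() is used to walk the table below.
 */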
307
308/*
309 * This function reads the last device id the IOMMU has to handle from the PCI
310 * capability header for this IOMMU
311 */
312static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
313{
314 u32 cap;
315
316 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
317 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
318
319 return 0;
320}
321
322/*
323 * After reading the highest device id from the IOMMU PCI capability header
324 * this function checks whether a higher device id is defined in the ACPI table
325 */
326static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
327{
328 u8 *p = (void *)h, *end = (void *)h;
329 struct ivhd_entry *dev;
330
331 p += sizeof(*h);
332 end += h->length;
333
334 find_last_devid_on_pci(PCI_BUS(h->devid),
335 PCI_SLOT(h->devid),
336 PCI_FUNC(h->devid),
337 h->cap_ptr);
338
339 while (p < end) {
340 dev = (struct ivhd_entry *)p;
341 switch (dev->type) {
342 case IVHD_DEV_SELECT:
343 case IVHD_DEV_RANGE_END:
344 case IVHD_DEV_ALIAS:
345 case IVHD_DEV_EXT_SELECT:
346 /* all the above subfield types refer to device ids */
347 update_last_devid(dev->devid);
348 break;
349 default:
350 break;
351 }
352 p += ivhd_entry_length(p);
353 }
354
355 WARN_ON(p != end);
356
357 return 0;
358}
359
360/*
361 * Iterate over all IVHD entries in the ACPI table and find the highest device
362 * id which we need to handle. This is the first of three functions which parse
363 * the ACPI table. So we check the checksum here.
364 */
365static int __init find_last_devid_acpi(struct acpi_table_header *table)
366{
367 int i;
368 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
369 struct ivhd_header *h;
370
371 /*
372 * Validate checksum here so we don't need to do it when
373 * we actually parse the table
374 */
375 for (i = 0; i < table->length; ++i)
376 checksum += p[i];
377 if (checksum != 0)
378 /* ACPI table corrupt */
379 return -ENODEV;
380
381 p += IVRS_HEADER_LENGTH;
382
383 end += table->length;
384 while (p < end) {
385 h = (struct ivhd_header *)p;
386 switch (h->type) {
387 case ACPI_IVHD_TYPE:
388 find_last_devid_from_ivhd(h);
389 break;
390 default:
391 break;
392 }
393 p += h->length;
394 }
395 WARN_ON(p != end);
396
397 return 0;
398}
399
400/****************************************************************************
401 *
402 * The following functions belong to the code path which parses the ACPI table
403 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
404 * data structures, initialize the device/alias/rlookup table and also
405 * basically initialize the hardware.
406 *
407 ****************************************************************************/
408
409/*
410 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
411 * write commands to that buffer later and the IOMMU will execute them
412 * asynchronously
413 */
414static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
415{
416 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
417 get_order(CMD_BUFFER_SIZE));
418 u64 entry;
419
420 if (cmd_buf == NULL)
421 return NULL;
422
423 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
424
425 entry = (u64)virt_to_phys(cmd_buf);
426 entry |= MMIO_CMD_SIZE_512;
427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
428 &entry, sizeof(entry));
429
430 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
431
432 return cmd_buf;
433}
434
435static void __init free_command_buffer(struct amd_iommu *iommu)
436{
437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
463}
464
465/* sets a specific bit in the device table entry. */
466static void set_dev_entry_bit(u16 devid, u8 bit)
467{
468 int i = (bit >> 5) & 0x07;
469 int _bit = bit & 0x1f;
470
471 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
472}
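
/*
 * Editorial worked example for set_dev_entry_bit(): the 256 bit device
 * table entry is addressed here as eight 32 bit words, so bit 97 lands in
 * word i = (97 >> 5) & 0x07 = 3 at position _bit = 97 & 0x1f = 1, i.e.
 * data[3] |= (1 << 1).
 */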
473
474/* Writes the specific IOMMU for a device into the rlookup table */
475static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
476{
477 amd_iommu_rlookup_table[devid] = iommu;
478}
479
480/*
481 * This function takes the device specific flags read from the ACPI
482 * table and sets up the device table entry with that information
483 */
484static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
485 u16 devid, u32 flags, u32 ext_flags)
486{
487 if (flags & ACPI_DEVFLAG_INITPASS)
488 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
489 if (flags & ACPI_DEVFLAG_EXTINT)
490 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
491 if (flags & ACPI_DEVFLAG_NMI)
492 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
493 if (flags & ACPI_DEVFLAG_SYSMGT1)
494 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
495 if (flags & ACPI_DEVFLAG_SYSMGT2)
496 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
497 if (flags & ACPI_DEVFLAG_LINT0)
498 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
499 if (flags & ACPI_DEVFLAG_LINT1)
500 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
501
502 set_iommu_for_device(iommu, devid);
503}
504
505/*
506 * Reads the device exclusion range from ACPI and initializes the IOMMU with
507 * it
508 */
509static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
510{
511 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
512
513 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
514 return;
515
516 if (iommu) {
517 /*
518 * We only can configure exclusion ranges per IOMMU, not
519 * per device. But we can enable the exclusion range per
520 * device. This is done here
521 */
522 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
523 iommu->exclusion_start = m->range_start;
524 iommu->exclusion_length = m->range_length;
525 }
526}
527
528/*
529 * This function reads some important data from the IOMMU PCI space and
530 * initializes the driver data structure with it. It reads the hardware
531 * capabilities and the first/last device entries
532 */
533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
534{
535 int cap_ptr = iommu->cap_ptr;
536 u32 range, misc;
537
538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
544
545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
546 MMIO_GET_FD(range));
547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
550}
551
552/*
553 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
554 * initializes the hardware and our data structures with it.
555 */
556static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
557 struct ivhd_header *h)
558{
559 u8 *p = (u8 *)h;
560 u8 *end = p, flags = 0;
561 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
562 u32 ext_flags = 0;
563 bool alias = false;
564 struct ivhd_entry *e;
565
566 /*
567 * First set the recommended feature enable bits from ACPI
568 * into the IOMMU control registers
569 */
570 h->flags & IVHD_FLAG_HT_TUN_EN ?
571 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
572 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
573
574 h->flags & IVHD_FLAG_PASSPW_EN ?
575 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
576 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
577
578 h->flags & IVHD_FLAG_RESPASSPW_EN ?
579 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
580 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
581
582 h->flags & IVHD_FLAG_ISOC_EN ?
583 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
584 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
585
586 /*
587 * make IOMMU memory accesses cache coherent
588 */
589 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
590
591 /*
592 * Done. Now parse the device entries
593 */
594 p += sizeof(struct ivhd_header);
595 end += h->length;
596
597 while (p < end) {
598 e = (struct ivhd_entry *)p;
599 switch (e->type) {
600 case IVHD_DEV_ALL:
601 for (dev_i = iommu->first_device;
602 dev_i <= iommu->last_device; ++dev_i)
603 set_dev_entry_from_acpi(iommu, dev_i,
604 e->flags, 0);
605 break;
606 case IVHD_DEV_SELECT:
607 devid = e->devid;
608 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
609 break;
610 case IVHD_DEV_SELECT_RANGE_START:
611 devid_start = e->devid;
612 flags = e->flags;
613 ext_flags = 0;
614 alias = false;
615 break;
616 case IVHD_DEV_ALIAS:
617 devid = e->devid;
618 devid_to = e->ext >> 8;
619 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
620 amd_iommu_alias_table[devid] = devid_to;
621 break;
622 case IVHD_DEV_ALIAS_RANGE:
623 devid_start = e->devid;
624 flags = e->flags;
625 devid_to = e->ext >> 8;
626 ext_flags = 0;
627 alias = true;
628 break;
629 case IVHD_DEV_EXT_SELECT:
630 devid = e->devid;
631 set_dev_entry_from_acpi(iommu, devid, e->flags,
632 e->ext);
633 break;
634 case IVHD_DEV_EXT_SELECT_RANGE:
635 devid_start = e->devid;
636 flags = e->flags;
637 ext_flags = e->ext;
638 alias = false;
639 break;
640 case IVHD_DEV_RANGE_END:
641 devid = e->devid;
642 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
643 if (alias)
644 amd_iommu_alias_table[dev_i] = devid_to;
645 set_dev_entry_from_acpi(iommu,
646 amd_iommu_alias_table[dev_i],
647 flags, ext_flags);
648 }
649 break;
650 default:
651 break;
652 }
653
654 p += ivhd_entry_length(p);
655 }
656}
657
658/* Initializes the device->iommu mapping for the driver */
659static int __init init_iommu_devices(struct amd_iommu *iommu)
660{
661 u16 i;
662
663 for (i = iommu->first_device; i <= iommu->last_device; ++i)
664 set_iommu_for_device(iommu, i);
665
666 return 0;
667}
668
669static void __init free_iommu_one(struct amd_iommu *iommu)
670{
671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
673 iommu_unmap_mmio_space(iommu);
674}
675
676static void __init free_iommu_all(void)
677{
678 struct amd_iommu *iommu, *next;
679
680 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
681 list_del(&iommu->list);
682 free_iommu_one(iommu);
683 kfree(iommu);
684 }
685}
686
687/*
688 * This function glues the initialization of one IOMMU
689 * together and also allocates the command buffer and programs the
690 * hardware. It does NOT enable the IOMMU. This is done afterwards.
691 */
692static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
693{
694 spin_lock_init(&iommu->lock);
695 list_add_tail(&iommu->list, &amd_iommu_list);
696
697 /*
698 * Copy data from ACPI table entry to the iommu struct
699 */
700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
706 iommu->mmio_phys = h->mmio_phys;
707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
708 if (!iommu->mmio_base)
709 return -ENOMEM;
710
711 iommu_set_device_table(iommu);
712 iommu->cmd_buf = alloc_command_buffer(iommu);
713 if (!iommu->cmd_buf)
714 return -ENOMEM;
715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
722 init_iommu_from_pci(iommu);
723 init_iommu_from_acpi(iommu, h);
724 init_iommu_devices(iommu);
725
726 return pci_enable_device(iommu->dev);
727}
728
729/*
730 * Iterates over all IOMMU entries in the ACPI table, allocates the
731 * IOMMU structure and initializes it with init_iommu_one()
732 */
733static int __init init_iommu_all(struct acpi_table_header *table)
734{
735 u8 *p = (u8 *)table, *end = (u8 *)table;
736 struct ivhd_header *h;
737 struct amd_iommu *iommu;
738 int ret;
739
740 end += table->length;
741 p += IVRS_HEADER_LENGTH;
742
743 while (p < end) {
744 h = (struct ivhd_header *)p;
745 switch (*p) {
746 case ACPI_IVHD_TYPE:
747 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
748 if (iommu == NULL)
749 return -ENOMEM;
750 ret = init_iommu_one(iommu, h);
751 if (ret)
752 return ret;
753 break;
754 default:
755 break;
756 }
757 p += h->length;
758
759 }
760 WARN_ON(p != end);
761
762 return 0;
763}
764
765/****************************************************************************
766 *
767 * The following functions initialize the MSI interrupts for all IOMMUs
768 * in the system. It's a bit challenging because there could be multiple
769 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
770 * pci_dev.
771 *
772 ****************************************************************************/
773
774static int __init iommu_setup_msix(struct amd_iommu *iommu)
775{
776 struct amd_iommu *curr;
777 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
778 int nvec = 0, i;
779
780 list_for_each_entry(curr, &amd_iommu_list, list) {
781 if (curr->dev == iommu->dev) {
782 entries[nvec].entry = curr->evt_msi_num;
783 entries[nvec].vector = 0;
784 curr->int_enabled = true;
785 nvec++;
786 }
787 }
788
789 if (pci_enable_msix(iommu->dev, entries, nvec)) {
790 pci_disable_msix(iommu->dev);
791 return 1;
792 }
793
794 for (i = 0; i < nvec; ++i) {
795		int r = request_irq(entries[i].vector, amd_iommu_int_handler,
796 IRQF_SAMPLE_RANDOM,
797 "AMD IOMMU",
798 NULL);
799 if (r)
800 goto out_free;
801 }
802
803 return 0;
804
805out_free:
806 for (i -= 1; i >= 0; --i)
807		free_irq(entries[i].vector, NULL);
808
809 pci_disable_msix(iommu->dev);
810
811 return 1;
812}
813
814static int __init iommu_setup_msi(struct amd_iommu *iommu)
815{
816 int r;
817 struct amd_iommu *curr;
818
819 list_for_each_entry(curr, &amd_iommu_list, list) {
820 if (curr->dev == iommu->dev)
821 curr->int_enabled = true;
822 }
823
824
825 if (pci_enable_msi(iommu->dev))
826 return 1;
827
828 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
829 IRQF_SAMPLE_RANDOM,
830 "AMD IOMMU",
831 NULL);
832
833 if (r) {
834 pci_disable_msi(iommu->dev);
835 return 1;
836 }
837
838 return 0;
839}
840
841static int __init iommu_init_msi(struct amd_iommu *iommu)
842{
843 if (iommu->int_enabled)
844 return 0;
845
846 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
847 return iommu_setup_msix(iommu);
848 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msi(iommu);
850
851 return 1;
852}
853
854/****************************************************************************
855 *
856 * The next functions belong to the third pass of parsing the ACPI
857 * table. In this last pass the memory mapping requirements are
858 * gathered (like exclusion and unity mapping ranges).
859 *
860 ****************************************************************************/
861
862static void __init free_unity_maps(void)
863{
864 struct unity_map_entry *entry, *next;
865
866 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
867 list_del(&entry->list);
868 kfree(entry);
869 }
870}
871
872/* called when we find an exclusion range definition in ACPI */
873static int __init init_exclusion_range(struct ivmd_header *m)
874{
875 int i;
876
877 switch (m->type) {
878 case ACPI_IVMD_TYPE:
879 set_device_exclusion_range(m->devid, m);
880 break;
881 case ACPI_IVMD_TYPE_ALL:
882 for (i = 0; i <= amd_iommu_last_bdf; ++i)
883 set_device_exclusion_range(i, m);
884 break;
885 case ACPI_IVMD_TYPE_RANGE:
886 for (i = m->devid; i <= m->aux; ++i)
887 set_device_exclusion_range(i, m);
888 break;
889 default:
890 break;
891 }
892
893 return 0;
894}
895
896/* called for unity map ACPI definition */
897static int __init init_unity_map_range(struct ivmd_header *m)
898{
899	struct unity_map_entry *e = NULL;
900
901 e = kzalloc(sizeof(*e), GFP_KERNEL);
902 if (e == NULL)
903 return -ENOMEM;
904
905 switch (m->type) {
906 default:
907 case ACPI_IVMD_TYPE:
908 e->devid_start = e->devid_end = m->devid;
909 break;
910 case ACPI_IVMD_TYPE_ALL:
911 e->devid_start = 0;
912 e->devid_end = amd_iommu_last_bdf;
913 break;
914 case ACPI_IVMD_TYPE_RANGE:
915 e->devid_start = m->devid;
916 e->devid_end = m->aux;
917 break;
918 }
919 e->address_start = PAGE_ALIGN(m->range_start);
920 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
921 e->prot = m->flags >> 1;
922
923 list_add_tail(&e->list, &amd_iommu_unity_map);
924
925 return 0;
926}
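
/*
 * Editorial worked example (hypothetical IVMD values): range_start = 0x100800
 * and range_length = 0x1800 give address_start = PAGE_ALIGN(0x100800) =
 * 0x101000 and address_end = 0x101000 + PAGE_ALIGN(0x1800) = 0x103000, so
 * dma_ops_unity_map() later walks two 4kb pages for this entry.
 */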
927
928/* iterates over all memory definitions we find in the ACPI table */
929static int __init init_memory_definitions(struct acpi_table_header *table)
930{
931 u8 *p = (u8 *)table, *end = (u8 *)table;
932 struct ivmd_header *m;
933
934 end += table->length;
935 p += IVRS_HEADER_LENGTH;
936
937 while (p < end) {
938 m = (struct ivmd_header *)p;
939 if (m->flags & IVMD_FLAG_EXCL_RANGE)
940 init_exclusion_range(m);
941 else if (m->flags & IVMD_FLAG_UNITY_MAP)
942 init_unity_map_range(m);
943
944 p += m->length;
945 }
946
947 return 0;
948}
949
950/*
951 * Init the device table to not allow DMA access for devices and
952 * suppress all page faults
953 */
954static void init_device_table(void)
955{
956 u16 devid;
957
958 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
959 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
960 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
961 }
962}
963
964/*
965 * This function finally enables all IOMMUs found in the system after
966 * they have been initialized
967 */
968static void __init enable_iommus(void)
969{
970 struct amd_iommu *iommu;
971
972 list_for_each_entry(iommu, &amd_iommu_list, list) {
973 iommu_set_exclusion_range(iommu);
974 iommu_init_msi(iommu);
975 iommu_enable_event_logging(iommu);
976 iommu_enable(iommu);
977 }
978}
979
980/*
981 * Suspend/Resume support
982 * disable suspend until real resume is implemented
983 */
984
985static int amd_iommu_resume(struct sys_device *dev)
986{
987 return 0;
988}
989
990static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
991{
992 return -EINVAL;
993}
994
995static struct sysdev_class amd_iommu_sysdev_class = {
996 .name = "amd_iommu",
997 .suspend = amd_iommu_suspend,
998 .resume = amd_iommu_resume,
999};
1000
1001static struct sys_device device_amd_iommu = {
1002 .id = 0,
1003 .cls = &amd_iommu_sysdev_class,
1004};
1005
1006/*
1007 * This is the core init function for AMD IOMMU hardware in the system.
1008 * This function is called from the generic x86 DMA layer initialization
1009 * code.
1010 *
1011 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1012 * three times:
1013 *
1014 * 1st pass) Find the highest PCI device id the driver has to handle.
1015 * Based on this information the size of the data structures
1016 * that need to be allocated is determined.
1017 *
1018 * 2nd pass) Initialize the data structures just allocated with the
1019 * information in the ACPI table about available AMD IOMMUs
1020 * in the system. This pass also maps the PCI devices in the
1021 * system to specific IOMMUs.
1022 *
1023 * 3rd pass) After the basic data structures are allocated and
1024 * initialized, they are updated with the memory remapping
1025 * requirements parsed out of the ACPI table in this last
1026 * pass.
1027 *
1028 * After that the hardware is initialized and ready to go. In the last
1029 * step we do some Linux-specific things like registering the driver in
1030 * the dma_ops interface and initializing the suspend/resume support
1031 * functions. Finally, it prints some information about the AMD IOMMUs
1032 * and the driver state, and enables the hardware.
1033 */
1034int __init amd_iommu_init(void)
1035{
1036 int i, ret = 0;
1037
1038
1039 if (no_iommu) {
1040 printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
1041 return 0;
1042 }
1043
1044 if (!amd_iommu_detected)
1045 return -ENODEV;
1046
1047 /*
1048 * First parse ACPI tables to find the largest Bus/Dev/Func
1049 * we need to handle. Based on this information the shared data
1050 * structures for the IOMMUs in the system will be allocated.
1051 */
1052 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1053 return -ENODEV;
1054
1055 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1056 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1057 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
1058
1059 ret = -ENOMEM;
1060
1061 /* Device table - directly used by all IOMMUs */
1062 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1063 get_order(dev_table_size));
1064 if (amd_iommu_dev_table == NULL)
1065 goto out;
1066
1067 /*
1068 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
1069 * IOMMU sees for that device
1070 */
1071 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
1072 get_order(alias_table_size));
1073 if (amd_iommu_alias_table == NULL)
1074 goto free;
1075
1076 /* IOMMU rlookup table - find the IOMMU for a specific device */
1077 amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
1078 get_order(rlookup_table_size));
1079 if (amd_iommu_rlookup_table == NULL)
1080 goto free;
1081
1082 /*
1083 * Protection Domain table - maps devices to protection domains
1084 * This table has the same size as the rlookup_table
1085 */
1086 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1087 get_order(rlookup_table_size));
1088 if (amd_iommu_pd_table == NULL)
1089 goto free;
1090
1091 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1092 GFP_KERNEL | __GFP_ZERO,
1093 get_order(MAX_DOMAIN_ID/8));
1094 if (amd_iommu_pd_alloc_bitmap == NULL)
1095 goto free;
1096
1097 /* init the device table */
1098 init_device_table();
1099
1100 /*
1101 * let all alias entries point to themselves
1102 */
1103 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1104 amd_iommu_alias_table[i] = i;
1105
1106 /*
1107 * never allocate domain 0 because it's used as the non-allocated and
1108 * error value placeholder
1109 */
1110 amd_iommu_pd_alloc_bitmap[0] = 1;
1111
1112 /*
1113 * now the data structures are allocated and basically initialized
1114 * start the real acpi table scan
1115 */
1116 ret = -ENODEV;
1117 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1118 goto free;
1119
1120 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1121 goto free;
1122
1123 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1124 if (ret)
1125 goto free;
1126
1127 ret = sysdev_register(&device_amd_iommu);
1128 if (ret)
1129 goto free;
1130
1131 ret = amd_iommu_init_dma_ops();
1132 if (ret)
1133 goto free;
1134
1135 enable_iommus();
1136
1137 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1138 (1 << (amd_iommu_aperture_order-20)));
1139
1140 printk(KERN_INFO "AMD IOMMU: device isolation ");
1141 if (amd_iommu_isolate)
1142 printk("enabled\n");
1143 else
1144 printk("disabled\n");
1145
1146 if (amd_iommu_unmap_flush)
1147 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1148 else
1149 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1150
1151out:
1152 return ret;
1153
1154free:
1155 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1156 get_order(MAX_DOMAIN_ID/8));
1157
1158 free_pages((unsigned long)amd_iommu_pd_table,
1159 get_order(rlookup_table_size));
1160
1161 free_pages((unsigned long)amd_iommu_rlookup_table,
1162 get_order(rlookup_table_size));
1163
1164 free_pages((unsigned long)amd_iommu_alias_table,
1165 get_order(alias_table_size));
1166
1167 free_pages((unsigned long)amd_iommu_dev_table,
1168 get_order(dev_table_size));
1169
1170 free_iommu_all();
1171
1172 free_unity_maps();
1173
1174 goto out;
1175}
1176
1177/****************************************************************************
1178 *
1179 * Early detect code. This code runs at IOMMU detection time in the DMA
1180 * layer. It just checks whether an IVRS ACPI table is present to detect AMD
1181 * IOMMUs
1182 *
1183 ****************************************************************************/
1184static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1185{
1186 return 0;
1187}
1188
1189void __init amd_iommu_detect(void)
1190{
1191 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
1192 return;
1193
1194 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1195 iommu_detected = 1;
1196 amd_iommu_detected = 1;
1197#ifdef CONFIG_GART_IOMMU
1198 gart_iommu_aperture_disabled = 1;
1199 gart_iommu_aperture = 0;
1200#endif
1201 }
1202}
1203
1204/****************************************************************************
1205 *
1206 * Parsing functions for the AMD IOMMU specific kernel command line
1207 * options.
1208 *
1209 ****************************************************************************/
1210
1211static int __init parse_amd_iommu_options(char *str)
1212{
1213 for (; *str; ++str) {
1214 if (strncmp(str, "isolate", 7) == 0)
1215 amd_iommu_isolate = 1;
1216 if (strncmp(str, "fullflush", 11) == 0)
1217 amd_iommu_unmap_flush = true;
1218 }
1219
1220 return 1;
1221}
1222
1223static int __init parse_amd_iommu_size_options(char *str)
1224{
1225 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1226
1227 if ((order > 24) && (order < 31))
1228 amd_iommu_aperture_order = order;
1229
1230 return 1;
1231}
1232
1233__setup("amd_iommu=", parse_amd_iommu_options);
1234__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
@@ -35,6 +36,18 @@ int fallback_aper_force __initdata;
35 36
36int fix_aperture __initdata = 1; 37int fix_aperture __initdata = 1;
37 38
39struct bus_dev_range {
40 int bus;
41 int dev_base;
42 int dev_limit;
43};
44
45static struct bus_dev_range bus_dev_ranges[] __initdata = {
46 { 0x00, 0x18, 0x20},
47 { 0xff, 0x00, 0x20},
48 { 0xfe, 0x00, 0x20}
49};
50
38static struct resource gart_resource = { 51static struct resource gart_resource = {
39 .name = "GART", 52 .name = "GART",
40 .flags = IORESOURCE_MEM, 53 .flags = IORESOURCE_MEM,
@@ -55,8 +68,9 @@ static u32 __init allocate_aperture(void)
55 u32 aper_size; 68 u32 aper_size;
56 void *p; 69 void *p;
57 70
58 if (fallback_aper_order > 7) 71 /* aper_size should be <= 1G */
59 fallback_aper_order = 7; 72 if (fallback_aper_order > 5)
73 fallback_aper_order = 5;
60 aper_size = (32 * 1024 * 1024) << fallback_aper_order; 74 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
61 75
62 /* 76 /*
@@ -65,7 +79,20 @@ static u32 __init allocate_aperture(void)
65 * memory. Unfortunately we cannot move it up because that would 79 * memory. Unfortunately we cannot move it up because that would
66 * make the IOMMU useless. 80 * make the IOMMU useless.
67 */ 81 */
68 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); 82 /*
83 * Use 512M as the allocation goal, in case kexec later loads a big
84 * kernel image that is decompressed in place and could overlap with
85 * the GART region that is still in use.
86 * sequence:
87 * kernel_small
88 * ==> kexec (with kdump trigger path or previous doesn't shut down gart)
89 * ==> kernel_small (gart area becomes e820_reserved)
90 * ==> kexec (with kdump trigger path or previous doesn't shut down gart)
91 * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
92 * so don't place the GART below 512M; leave that space for the kernel
93 * code, to be safe.
94 */
95 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
69 if (!p || __pa(p)+aper_size > 0xffffffff) { 96 if (!p || __pa(p)+aper_size > 0xffffffff) {
70 printk(KERN_ERR 97 printk(KERN_ERR
71 "Cannot allocate aperture memory hole (%p,%uK)\n", 98 "Cannot allocate aperture memory hole (%p,%uK)\n",
@@ -83,69 +110,53 @@ static u32 __init allocate_aperture(void)
83 return (u32)__pa(p); 110 return (u32)__pa(p);
84} 111}
85 112
86static int __init aperture_valid(u64 aper_base, u32 aper_size)
87{
88 if (!aper_base)
89 return 0;
90
91 if (aper_base + aper_size > 0x100000000UL) {
92 printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
93 return 0;
94 }
95 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
96 printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
97 return 0;
98 }
99 if (aper_size < 64*1024*1024) {
100 printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
101 return 0;
102 }
103
104 return 1;
105}
106 113
107/* Find a PCI capability */ 114/* Find a PCI capability */
108static __u32 __init find_cap(int num, int slot, int func, int cap) 115static u32 __init find_cap(int bus, int slot, int func, int cap)
109{ 116{
110 int bytes; 117 int bytes;
111 u8 pos; 118 u8 pos;
112 119
113 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & 120 if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
114 PCI_STATUS_CAP_LIST)) 121 PCI_STATUS_CAP_LIST))
115 return 0; 122 return 0;
116 123
117 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); 124 pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
118 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 125 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
119 u8 id; 126 u8 id;
120 127
121 pos &= ~3; 128 pos &= ~3;
122 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); 129 id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
123 if (id == 0xff) 130 if (id == 0xff)
124 break; 131 break;
125 if (id == cap) 132 if (id == cap)
126 return pos; 133 return pos;
127 pos = read_pci_config_byte(num, slot, func, 134 pos = read_pci_config_byte(bus, slot, func,
128 pos+PCI_CAP_LIST_NEXT); 135 pos+PCI_CAP_LIST_NEXT);
129 } 136 }
130 return 0; 137 return 0;
131} 138}
132 139
133/* Read a standard AGPv3 bridge header */ 140/* Read a standard AGPv3 bridge header */
134static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) 141static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
135{ 142{
136 u32 apsize; 143 u32 apsize;
137 u32 apsizereg; 144 u32 apsizereg;
138 int nbits; 145 int nbits;
139 u32 aper_low, aper_hi; 146 u32 aper_low, aper_hi;
140 u64 aper; 147 u64 aper;
148 u32 old_order;
141 149
142 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); 150 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
143 apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); 151 apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
144 if (apsizereg == 0xffffffff) { 152 if (apsizereg == 0xffffffff) {
145 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); 153 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
146 return 0; 154 return 0;
147 } 155 }
148 156
157 /* old_order could be the value from NB gart setting */
158 old_order = *order;
159
149 apsize = apsizereg & 0xfff; 160 apsize = apsizereg & 0xfff;
150 /* Some BIOS use weird encodings not in the AGPv3 table. */ 161 /* Some BIOS use weird encodings not in the AGPv3 table. */
151 if (apsize & 0xff) 162 if (apsize & 0xff)
@@ -155,14 +166,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
155 if ((int)*order < 0) /* < 32MB */ 166 if ((int)*order < 0) /* < 32MB */
156 *order = 0; 167 *order = 0;
157 168
158 aper_low = read_pci_config(num, slot, func, 0x10); 169 aper_low = read_pci_config(bus, slot, func, 0x10);
159 aper_hi = read_pci_config(num, slot, func, 0x14); 170 aper_hi = read_pci_config(bus, slot, func, 0x14);
160 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); 171 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
161 172
173 /*
174 * On some sick chips, APSIZE is 0. It means it wants 4G
175 * so let's double-check that order, and trust the AMD NB settings:
176 */
177 printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
178 aper, 32 << old_order);
179 if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
180 printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
181 32 << *order, apsizereg);
182 *order = old_order;
183 }
184
162 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 185 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
163 aper, 32 << *order, apsizereg); 186 aper, 32 << *order, apsizereg);
164 187
165 if (!aperture_valid(aper, (32*1024*1024) << *order)) 188 if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
166 return 0; 189 return 0;
167 return (u32)aper; 190 return (u32)aper;
168} 191}
@@ -180,17 +203,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
180 * the AGP bridges should be always an own bus on the HT hierarchy, 203 * the AGP bridges should be always an own bus on the HT hierarchy,
181 * but do it here for future safety. 204 * but do it here for future safety.
182 */ 205 */
183static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) 206static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
184{ 207{
185 int num, slot, func; 208 int bus, slot, func;
186 209
187 /* Poor man's PCI discovery */ 210 /* Poor man's PCI discovery */
188 for (num = 0; num < 256; num++) { 211 for (bus = 0; bus < 256; bus++) {
189 for (slot = 0; slot < 32; slot++) { 212 for (slot = 0; slot < 32; slot++) {
190 for (func = 0; func < 8; func++) { 213 for (func = 0; func < 8; func++) {
191 u32 class, cap; 214 u32 class, cap;
192 u8 type; 215 u8 type;
193 class = read_pci_config(num, slot, func, 216 class = read_pci_config(bus, slot, func,
194 PCI_CLASS_REVISION); 217 PCI_CLASS_REVISION);
195 if (class == 0xffffffff) 218 if (class == 0xffffffff)
196 break; 219 break;
@@ -199,17 +222,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
199 case PCI_CLASS_BRIDGE_HOST: 222 case PCI_CLASS_BRIDGE_HOST:
200 case PCI_CLASS_BRIDGE_OTHER: /* needed? */ 223 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
201 /* AGP bridge? */ 224 /* AGP bridge? */
202 cap = find_cap(num, slot, func, 225 cap = find_cap(bus, slot, func,
203 PCI_CAP_ID_AGP); 226 PCI_CAP_ID_AGP);
204 if (!cap) 227 if (!cap)
205 break; 228 break;
206 *valid_agp = 1; 229 *valid_agp = 1;
207 return read_agp(num, slot, func, cap, 230 return read_agp(bus, slot, func, cap,
208 order); 231 order);
209 } 232 }
210 233
211 /* No multi-function device? */ 234 /* No multi-function device? */
212 type = read_pci_config_byte(num, slot, func, 235 type = read_pci_config_byte(bus, slot, func,
213 PCI_HEADER_TYPE); 236 PCI_HEADER_TYPE);
214 if (!(type & 0x80)) 237 if (!(type & 0x80))
215 break; 238 break;
@@ -249,36 +272,50 @@ void __init early_gart_iommu_check(void)
249 * or BIOS forget to put that in reserved. 272 * or BIOS forget to put that in reserved.
250 * try to update e820 to make that region as reserved. 273 * try to update e820 to make that region as reserved.
251 */ 274 */
252 int fix, num; 275 int i, fix, slot;
253 u32 ctl; 276 u32 ctl;
254 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 277 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
255 u64 aper_base = 0, last_aper_base = 0; 278 u64 aper_base = 0, last_aper_base = 0;
256 int aper_enabled = 0, last_aper_enabled = 0; 279 int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
257 280
258 if (!early_pci_allowed()) 281 if (!early_pci_allowed())
259 return; 282 return;
260 283
284 /* This is mostly a duplicate of iommu_hole_init */
261 fix = 0; 285 fix = 0;
262 for (num = 24; num < 32; num++) { 286 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
263 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 287 int bus;
264 continue; 288 int dev_base, dev_limit;
265 289
266 ctl = read_pci_config(0, num, 3, 0x90); 290 bus = bus_dev_ranges[i].bus;
267 aper_enabled = ctl & 1; 291 dev_base = bus_dev_ranges[i].dev_base;
268 aper_order = (ctl >> 1) & 7; 292 dev_limit = bus_dev_ranges[i].dev_limit;
269 aper_size = (32 * 1024 * 1024) << aper_order; 293
270 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 294 for (slot = dev_base; slot < dev_limit; slot++) {
271 aper_base <<= 25; 295 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
272 296 continue;
273 if ((last_aper_order && aper_order != last_aper_order) || 297
274 (last_aper_base && aper_base != last_aper_base) || 298 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
275 (last_aper_enabled && aper_enabled != last_aper_enabled)) { 299 aper_enabled = ctl & AMD64_GARTEN;
276 fix = 1; 300 aper_order = (ctl >> 1) & 7;
277 break; 301 aper_size = (32 * 1024 * 1024) << aper_order;
302 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
303 aper_base <<= 25;
304
305 if (last_valid) {
306 if ((aper_order != last_aper_order) ||
307 (aper_base != last_aper_base) ||
308 (aper_enabled != last_aper_enabled)) {
309 fix = 1;
310 break;
311 }
312 }
313
314 last_aper_order = aper_order;
315 last_aper_base = aper_base;
316 last_aper_enabled = aper_enabled;
317 last_valid = 1;
278 } 318 }
279 last_aper_order = aper_order;
280 last_aper_base = aper_base;
281 last_aper_enabled = aper_enabled;
282 } 319 }
283 320
284 if (!fix && !aper_enabled) 321 if (!fix && !aper_enabled)
@@ -290,32 +327,46 @@ void __init early_gart_iommu_check(void)
290 if (gart_fix_e820 && !fix && aper_enabled) { 327 if (gart_fix_e820 && !fix && aper_enabled) {
291 if (e820_any_mapped(aper_base, aper_base + aper_size, 328 if (e820_any_mapped(aper_base, aper_base + aper_size,
292 E820_RAM)) { 329 E820_RAM)) {
293 /* reserved it, so we can resuse it in second kernel */ 330 /* reserve it, so we can reuse it in second kernel */
294 printk(KERN_INFO "update e820 for GART\n"); 331 printk(KERN_INFO "update e820 for GART\n");
295 add_memory_region(aper_base, aper_size, E820_RESERVED); 332 e820_add_region(aper_base, aper_size, E820_RESERVED);
296 update_e820(); 333 update_e820();
297 } 334 }
298 return;
299 } 335 }
300 336
337 if (!fix)
338 return;
339
301 /* different nodes have different setting, disable them all at first*/ 340 /* different nodes have different setting, disable them all at first*/
302 for (num = 24; num < 32; num++) { 341 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
303 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 342 int bus;
304 continue; 343 int dev_base, dev_limit;
344
345 bus = bus_dev_ranges[i].bus;
346 dev_base = bus_dev_ranges[i].dev_base;
347 dev_limit = bus_dev_ranges[i].dev_limit;
305 348
306 ctl = read_pci_config(0, num, 3, 0x90); 349 for (slot = dev_base; slot < dev_limit; slot++) {
307 ctl &= ~1; 350 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
308 write_pci_config(0, num, 3, 0x90, ctl); 351 continue;
352
353 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
354 ctl &= ~AMD64_GARTEN;
355 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
356 }
309 } 357 }
310 358
311} 359}
312 360
361static int __initdata printed_gart_size_msg;
362
313void __init gart_iommu_hole_init(void) 363void __init gart_iommu_hole_init(void)
314{ 364{
365 u32 agp_aper_base = 0, agp_aper_order = 0;
315 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 366 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
316 u64 aper_base, last_aper_base = 0; 367 u64 aper_base, last_aper_base = 0;
317 int fix, num, valid_agp = 0; 368 int fix, slot, valid_agp = 0;
318 int node; 369 int i, node;
319 370
320 if (gart_iommu_aperture_disabled || !fix_aperture || 371 if (gart_iommu_aperture_disabled || !fix_aperture ||
321 !early_pci_allowed()) 372 !early_pci_allowed())
@@ -323,38 +374,65 @@ void __init gart_iommu_hole_init(void)
323 374
324 printk(KERN_INFO "Checking aperture...\n"); 375 printk(KERN_INFO "Checking aperture...\n");
325 376
377 if (!fallback_aper_force)
378 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
379
326 fix = 0; 380 fix = 0;
327 node = 0; 381 node = 0;
328 for (num = 24; num < 32; num++) { 382 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
329 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 383 int bus;
330 continue; 384 int dev_base, dev_limit;
331 385
332 iommu_detected = 1; 386 bus = bus_dev_ranges[i].bus;
333 gart_iommu_aperture = 1; 387 dev_base = bus_dev_ranges[i].dev_base;
334 388 dev_limit = bus_dev_ranges[i].dev_limit;
335 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 389
336 aper_size = (32 * 1024 * 1024) << aper_order; 390 for (slot = dev_base; slot < dev_limit; slot++) {
337 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 391 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
338 aper_base <<= 25; 392 continue;
339 393
340 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", 394 iommu_detected = 1;
341 node, aper_base, aper_size >> 20); 395 gart_iommu_aperture = 1;
342 node++; 396
343 397 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
344 if (!aperture_valid(aper_base, aper_size)) { 398 aper_size = (32 * 1024 * 1024) << aper_order;
345 fix = 1; 399 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
346 break; 400 aper_base <<= 25;
347 } 401
402 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
403 node, aper_base, aper_size >> 20);
404 node++;
405
406 if (!aperture_valid(aper_base, aper_size, 64<<20)) {
407 if (valid_agp && agp_aper_base &&
408 agp_aper_base == aper_base &&
409 agp_aper_order == aper_order) {
410 /* NB and AGP report the same aperture setting */
411 if (!no_iommu &&
412 max_pfn > MAX_DMA32_PFN &&
413 !printed_gart_size_msg) {
414 printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
415 printk(KERN_ERR "please increase GART size in your BIOS setup\n");
416 printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
417 printed_gart_size_msg = 1;
418 }
419 } else {
420 fix = 1;
421 goto out;
422 }
423 }
348 424
349 if ((last_aper_order && aper_order != last_aper_order) || 425 if ((last_aper_order && aper_order != last_aper_order) ||
350 (last_aper_base && aper_base != last_aper_base)) { 426 (last_aper_base && aper_base != last_aper_base)) {
351 fix = 1; 427 fix = 1;
352 break; 428 goto out;
429 }
430 last_aper_order = aper_order;
431 last_aper_base = aper_base;
353 } 432 }
354 last_aper_order = aper_order;
355 last_aper_base = aper_base;
356 } 433 }
357 434
435out:
358 if (!fix && !fallback_aper_force) { 436 if (!fix && !fallback_aper_force) {
359 if (last_aper_base) { 437 if (last_aper_base) {
360 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 438 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -364,22 +442,24 @@ void __init gart_iommu_hole_init(void)
364 return; 442 return;
365 } 443 }
366 444
367 if (!fallback_aper_force) 445 if (!fallback_aper_force) {
368 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 446 aper_alloc = agp_aper_base;
447 aper_order = agp_aper_order;
448 }
369 449
370 if (aper_alloc) { 450 if (aper_alloc) {
371 /* Got the aperture from the AGP bridge */ 451 /* Got the aperture from the AGP bridge */
372 } else if (swiotlb && !valid_agp) { 452 } else if (swiotlb && !valid_agp) {
373 /* Do nothing */ 453 /* Do nothing */
374 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || 454 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
375 force_iommu || 455 force_iommu ||
376 valid_agp || 456 valid_agp ||
377 fallback_aper_force) { 457 fallback_aper_force) {
378 printk(KERN_ERR 458 printk(KERN_INFO
379 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
380 printk(KERN_ERR 460 printk(KERN_INFO
381 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
382 printk(KERN_ERR 462 printk(KERN_INFO
383 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
384 32 << fallback_aper_order); 464 32 << fallback_aper_order);
385 465
@@ -401,16 +481,24 @@ void __init gart_iommu_hole_init(void)
401 } 481 }
402 482
403 /* Fix up the north bridges */ 483 /* Fix up the north bridges */
404 for (num = 24; num < 32; num++) { 484 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
405 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 485 int bus;
406 continue; 486 int dev_base, dev_limit;
407 487
408 /* 488 bus = bus_dev_ranges[i].bus;
409 * Don't enable translation yet. That is done later. 489 dev_base = bus_dev_ranges[i].dev_base;
410 * Assume this BIOS didn't initialise the GART so 490 dev_limit = bus_dev_ranges[i].dev_limit;
411 * just overwrite all previous bits 491 for (slot = dev_base; slot < dev_limit; slot++) {
412 */ 492 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
413 write_pci_config(0, num, 3, 0x90, aper_order<<1); 493 continue;
414 write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 494
495 /* Don't enable translation yet. That is done later.
496 Assume this BIOS didn't initialise the GART so
497 just overwrite all previous bits */
498 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
499 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
500 }
415 } 501 }
502
503 set_up_gart_resume(aper_order, aper_alloc);
416} 504}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..21c831d96af3 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,29 +52,38 @@
52 52
53unsigned long mp_lapic_addr; 53unsigned long mp_lapic_addr;
54 54
55DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
56EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
57
58/* 55/*
59 * Knob to control our willingness to enable the local APIC. 56 * Knob to control our willingness to enable the local APIC.
60 * 57 *
61 * -1=force-disable, +1=force-enable 58 * +1=force-enable
62 */ 59 */
63static int enable_local_apic __initdata; 60static int force_enable_local_apic;
61int disable_apic;
64 62
65/* Local APIC timer verification ok */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_verify_ok; 64static int disable_apic_timer __cpuinitdata;
67/* Disable local APIC timer from the kernel commandline or via dmi quirk
68 or using CPU MSR check */
69int local_apic_timer_disabled;
70/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
71int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
72EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
73 68
69int first_system_vector = 0xfe;
70
71char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
72
74/* 73/*
75 * Debug level, exported for io_apic.c 74 * Debug level, exported for io_apic.c
76 */ 75 */
77int apic_verbosity; 76unsigned int apic_verbosity;
77
78int pic_mode;
79
80/* Have we found an MP table */
81int smp_found_config;
82
83static struct resource lapic_resource = {
84 .name = "Local APIC",
85 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
86};
78 87
79static unsigned int calibration_result; 88static unsigned int calibration_result;
80 89
@@ -119,7 +128,11 @@ static inline int lapic_get_version(void)
119 */ 128 */
120static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
121{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
122 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
123} 136}
124 137
125/* 138/*
@@ -134,13 +147,18 @@ static int modern_apic(void)
134 return lapic_get_version() >= 0x14; 147 return lapic_get_version() >= 0x14;
135} 148}
136 149
137void apic_wait_icr_idle(void) 150/*
151 * Paravirt kernels might also use the ops below. So we still use the
152 * generic apic_read()/apic_write(), which might point to different
153 * ops in the PARAVIRT case.
154 */
155void xapic_wait_icr_idle(void)
138{ 156{
139 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 157 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
140 cpu_relax(); 158 cpu_relax();
141} 159}
142 160
143u32 safe_apic_wait_icr_idle(void) 161u32 safe_xapic_wait_icr_idle(void)
144{ 162{
145 u32 send_status; 163 u32 send_status;
146 int timeout; 164 int timeout;
@@ -156,17 +174,49 @@ u32 safe_apic_wait_icr_idle(void)
156 return send_status; 174 return send_status;
157} 175}
158 176
177void xapic_icr_write(u32 low, u32 id)
178{
179 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
180 apic_write(APIC_ICR, low);
181}
182
183u64 xapic_icr_read(void)
184{
185 u32 icr1, icr2;
186
187 icr2 = apic_read(APIC_ICR2);
188 icr1 = apic_read(APIC_ICR);
189
190 return icr1 | ((u64)icr2 << 32);
191}
192
193static struct apic_ops xapic_ops = {
194 .read = native_apic_mem_read,
195 .write = native_apic_mem_write,
196 .icr_read = xapic_icr_read,
197 .icr_write = xapic_icr_write,
198 .wait_icr_idle = xapic_wait_icr_idle,
199 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
200};
201
202struct apic_ops __read_mostly *apic_ops = &xapic_ops;
203EXPORT_SYMBOL_GPL(apic_ops);
204
159/** 205/**
160 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 206 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
161 */ 207 */
162void __cpuinit enable_NMI_through_LVT0(void) 208void __cpuinit enable_NMI_through_LVT0(void)
163{ 209{
164 unsigned int v = APIC_DM_NMI; 210 unsigned int v;
211
212 /* unmask and set to NMI */
213 v = APIC_DM_NMI;
165 214
166 /* Level triggered for 82489DX */ 215 /* Level triggered for 82489DX (32bit mode) */
167 if (!lapic_is_integrated()) 216 if (!lapic_is_integrated())
168 v |= APIC_LVT_LEVEL_TRIGGER; 217 v |= APIC_LVT_LEVEL_TRIGGER;
169 apic_write_around(APIC_LVT0, v); 218
219 apic_write(APIC_LVT0, v);
170} 220}
171 221
172/** 222/**
@@ -182,9 +232,13 @@ int get_physical_broadcast(void)
182 */ 232 */
183int lapic_get_maxlvt(void) 233int lapic_get_maxlvt(void)
184{ 234{
185 unsigned int v = apic_read(APIC_LVR); 235 unsigned int v;
186 236
187 /* 82489DXs do not report # of LVT entries. */ 237 v = apic_read(APIC_LVR);
238 /*
239 * - we always have APIC integrated on 64bit mode
240 * - 82489DXs do not report # of LVT entries
241 */
188 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 242 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
189} 243}
190 244
@@ -192,8 +246,12 @@ int lapic_get_maxlvt(void)
192 * Local APIC timer 246 * Local APIC timer
193 */ 247 */
194 248
195/* Clock divisor is set to 16 */ 249/* Clock divisor */
250#ifdef CONFIG_X86_64
251#define APIC_DIVISOR 1
252#else
196#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
197 255
198/* 256/*
199 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -218,27 +276,61 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
218 if (!irqen) 276 if (!irqen)
219 lvtt_value |= APIC_LVT_MASKED; 277 lvtt_value |= APIC_LVT_MASKED;
220 278
221 apic_write_around(APIC_LVTT, lvtt_value); 279 apic_write(APIC_LVTT, lvtt_value);
222 280
223 /* 281 /*
224 * Divide PICLK by 16 282 * Divide PICLK by 16
225 */ 283 */
226 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
227 apic_write_around(APIC_TDCR, (tmp_value 285 apic_write(APIC_TDCR,
228 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
229 | APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
230 288
231 if (!oneshot) 289 if (!oneshot)
232 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
233} 291}
234 292
235/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 *
299 * If mask=1, the LVT entry does not generate interrupts while mask=0
300 * enables the vector. See also the BKDGs.
301 */
302
303#define APIC_EILVT_LVTOFF_MCE 0
304#define APIC_EILVT_LVTOFF_IBS 1
305
306static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
307{
308 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
309 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
310
311 apic_write(reg, v);
312}
313
314u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
315{
316 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
317 return APIC_EILVT_LVTOFF_MCE;
318}
319
320u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
321{
322 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
323 return APIC_EILVT_LVTOFF_IBS;
324}
325EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
326
327/*
236 * Program the next event, relative to now 328 * Program the next event, relative to now
237 */ 329 */
238static int lapic_next_event(unsigned long delta, 330static int lapic_next_event(unsigned long delta,
239 struct clock_event_device *evt) 331 struct clock_event_device *evt)
240{ 332{
241 apic_write_around(APIC_TMICT, delta); 333 apic_write(APIC_TMICT, delta);
242 return 0; 334 return 0;
243} 335}
244 336
@@ -251,8 +343,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
251 unsigned long flags; 343 unsigned long flags;
252 unsigned int v; 344 unsigned int v;
253 345
254 /* Lapic used for broadcast ? */ 346 /* Lapic used as dummy for broadcast ? */
255 if (!local_apic_timer_verify_ok) 347 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
256 return; 348 return;
257 349
258 local_irq_save(flags); 350 local_irq_save(flags);
@@ -267,7 +359,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
267 case CLOCK_EVT_MODE_SHUTDOWN: 359 case CLOCK_EVT_MODE_SHUTDOWN:
268 v = apic_read(APIC_LVTT); 360 v = apic_read(APIC_LVTT);
269 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 361 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
270 apic_write_around(APIC_LVTT, v); 362 apic_write(APIC_LVTT, v);
271 break; 363 break;
272 case CLOCK_EVT_MODE_RESUME: 364 case CLOCK_EVT_MODE_RESUME:
273 /* Nothing to do here */ 365 /* Nothing to do here */
@@ -361,12 +453,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
361 } 453 }
362} 454}
363 455
364/* 456static int __init calibrate_APIC_clock(void)
365 * Setup the boot APIC
366 *
367 * Calibrate and verify the result.
368 */
369void __init setup_boot_APIC_clock(void)
370{ 457{
371 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 458 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
372 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 459 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -376,24 +463,6 @@ void __init setup_boot_APIC_clock(void)
376 long delta, deltapm; 463 long delta, deltapm;
377 int pm_referenced = 0; 464 int pm_referenced = 0;
378 465
379 /*
380 * The local apic timer can be disabled via the kernel
381 * commandline or from the CPU detection code. Register the lapic
382 * timer as a dummy clock event source on SMP systems, so the
383 * broadcast mechanism is used. On UP systems simply ignore it.
384 */
385 if (local_apic_timer_disabled) {
386 /* No broadcast on UP ! */
387 if (num_possible_cpus() > 1) {
388 lapic_clockevent.mult = 1;
389 setup_APIC_timer();
390 }
391 return;
392 }
393
394 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
395 "calibrating APIC timer ...\n");
396
397 local_irq_disable(); 466 local_irq_disable();
398 467
399 /* Replace the global interrupt handler */ 468 /* Replace the global interrupt handler */
@@ -478,8 +547,6 @@ void __init setup_boot_APIC_clock(void)
478 calibration_result / (1000000 / HZ), 547 calibration_result / (1000000 / HZ),
479 calibration_result % (1000000 / HZ)); 548 calibration_result % (1000000 / HZ));
480 549
481 local_apic_timer_verify_ok = 1;
482
483 /* 550 /*
484 * Do a sanity check on the APIC calibration result 551 * Do a sanity check on the APIC calibration result
485 */ 552 */
@@ -487,12 +554,11 @@ void __init setup_boot_APIC_clock(void)
487 local_irq_enable(); 554 local_irq_enable();
488 printk(KERN_WARNING 555 printk(KERN_WARNING
489 "APIC frequency too slow, disabling apic timer\n"); 556 "APIC frequency too slow, disabling apic timer\n");
490 /* No broadcast on UP ! */ 557 return -1;
491 if (num_possible_cpus() > 1)
492 setup_APIC_timer();
493 return;
494 } 558 }
495 559
560 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
561
496 /* We trust the pm timer based calibration */ 562 /* We trust the pm timer based calibration */
497 if (!pm_referenced) { 563 if (!pm_referenced) {
498 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 564 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -525,29 +591,63 @@ void __init setup_boot_APIC_clock(void)
525 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 591 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
526 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 592 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
527 else 593 else
528 local_apic_timer_verify_ok = 0; 594 levt->features |= CLOCK_EVT_FEAT_DUMMY;
529 } else 595 } else
530 local_irq_enable(); 596 local_irq_enable();
531 597
532 if (!local_apic_timer_verify_ok) { 598 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
533 printk(KERN_WARNING 599 printk(KERN_WARNING
534 "APIC timer disabled due to verification failure.\n"); 600 "APIC timer disabled due to verification failure.\n");
601 return -1;
602 }
603
604 return 0;
605}
606
607/*
608 * Setup the boot APIC
609 *
610 * Calibrate and verify the result.
611 */
612void __init setup_boot_APIC_clock(void)
613{
614 /*
615 * The local apic timer can be disabled via the kernel
616 * commandline or from the CPU detection code. Register the lapic
617 * timer as a dummy clock event source on SMP systems, so the
618 * broadcast mechanism is used. On UP systems simply ignore it.
619 */
620 if (disable_apic_timer) {
621 printk(KERN_INFO "Disabling APIC timer\n");
535 /* No broadcast on UP ! */ 622 /* No broadcast on UP ! */
536 if (num_possible_cpus() == 1) 623 if (num_possible_cpus() > 1) {
537 return; 624 lapic_clockevent.mult = 1;
538 } else { 625 setup_APIC_timer();
539 /* 626 }
540 * If nmi_watchdog is set to IO_APIC, we need the 627 return;
541 * PIT/HPET going. Otherwise register lapic as a dummy
542 * device.
543 */
544 if (nmi_watchdog != NMI_IO_APIC)
545 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
546 else
547 printk(KERN_WARNING "APIC timer registered as dummy,"
548 " due to nmi_watchdog=1!\n");
549 } 628 }
550 629
630 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
631 "calibrating APIC timer ...\n");
632
633 if (calibrate_APIC_clock()) {
634 /* No broadcast on UP ! */
635 if (num_possible_cpus() > 1)
636 setup_APIC_timer();
637 return;
638 }
639
640 /*
641 * If nmi_watchdog is set to IO_APIC, we need the
642 * PIT/HPET going. Otherwise register lapic as a dummy
643 * device.
644 */
645 if (nmi_watchdog != NMI_IO_APIC)
646 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
647 else
648 printk(KERN_WARNING "APIC timer registered as dummy,"
649 " due to nmi_watchdog=%d!\n", nmi_watchdog);
650
551 /* Setup the lapic or request the broadcast */ 651 /* Setup the lapic or request the broadcast */
552 setup_APIC_timer(); 652 setup_APIC_timer();
553} 653}
@@ -587,7 +687,11 @@ static void local_apic_timer_interrupt(void)
587 /* 687 /*
588 * the NMI deadlock-detector uses this. 688 * the NMI deadlock-detector uses this.
589 */ 689 */
690#ifdef CONFIG_X86_64
691 add_pda(apic_timer_irqs, 1);
692#else
590 per_cpu(irq_stat, cpu).apic_timer_irqs++; 693 per_cpu(irq_stat, cpu).apic_timer_irqs++;
694#endif
591 695
592 evt->event_handler(evt); 696 evt->event_handler(evt);
593} 697}
@@ -627,35 +731,6 @@ int setup_profiling_timer(unsigned int multiplier)
627} 731}
628 732
629/* 733/*
630 * Setup extended LVT, AMD specific (K8, family 10h)
631 *
632 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
633 * MCE interrupts are supported. Thus MCE offset must be set to 0.
634 */
635
636#define APIC_EILVT_LVTOFF_MCE 0
637#define APIC_EILVT_LVTOFF_IBS 1
638
639static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
640{
641 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
642 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
643 apic_write(reg, v);
644}
645
646u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
647{
648 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
649 return APIC_EILVT_LVTOFF_MCE;
650}
651
652u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
653{
654 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
655 return APIC_EILVT_LVTOFF_IBS;
656}
657
658/*
659 * Local APIC start and shutdown 734 * Local APIC start and shutdown
660 */ 735 */
661 736
@@ -682,45 +757,41 @@ void clear_local_APIC(void)
682 */ 757 */
683 if (maxlvt >= 3) { 758 if (maxlvt >= 3) {
684 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 759 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
685 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 760 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
686 } 761 }
687 /* 762 /*
688 * Careful: we have to set masks only first to deassert 763 * Careful: we have to set masks only first to deassert
689 * any level-triggered sources. 764 * any level-triggered sources.
690 */ 765 */
691 v = apic_read(APIC_LVTT); 766 v = apic_read(APIC_LVTT);
692 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 767 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
693 v = apic_read(APIC_LVT0); 768 v = apic_read(APIC_LVT0);
694 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 769 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
695 v = apic_read(APIC_LVT1); 770 v = apic_read(APIC_LVT1);
696 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 771 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
697 if (maxlvt >= 4) { 772 if (maxlvt >= 4) {
698 v = apic_read(APIC_LVTPC); 773 v = apic_read(APIC_LVTPC);
699 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 774 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
700 } 775 }
701 776
702 /* lets not touch this if we didn't frob it */ 777 /* lets not touch this if we didn't frob it */
703#ifdef CONFIG_X86_MCE_P4THERMAL 778#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
704 if (maxlvt >= 5) { 779 if (maxlvt >= 5) {
705 v = apic_read(APIC_LVTTHMR); 780 v = apic_read(APIC_LVTTHMR);
706 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 781 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
707 } 782 }
708#endif 783#endif
709 /* 784 /*
710 * Clean APIC state for other OSs: 785 * Clean APIC state for other OSs:
711 */ 786 */
712 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 787 apic_write(APIC_LVTT, APIC_LVT_MASKED);
713 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 788 apic_write(APIC_LVT0, APIC_LVT_MASKED);
714 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 789 apic_write(APIC_LVT1, APIC_LVT_MASKED);
715 if (maxlvt >= 3) 790 if (maxlvt >= 3)
716 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 791 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
717 if (maxlvt >= 4) 792 if (maxlvt >= 4)
718 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 793 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
719 794
720#ifdef CONFIG_X86_MCE_P4THERMAL
721 if (maxlvt >= 5)
722 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
723#endif
724 /* Integrated APIC (!82489DX) ? */ 795 /* Integrated APIC (!82489DX) ? */
725 if (lapic_is_integrated()) { 796 if (lapic_is_integrated()) {
726 if (maxlvt > 3) 797 if (maxlvt > 3)
@@ -735,7 +806,7 @@ void clear_local_APIC(void)
735 */ 806 */
736void disable_local_APIC(void) 807void disable_local_APIC(void)
737{ 808{
738 unsigned long value; 809 unsigned int value;
739 810
740 clear_local_APIC(); 811 clear_local_APIC();
741 812
@@ -745,8 +816,9 @@ void disable_local_APIC(void)
745 */ 816 */
746 value = apic_read(APIC_SPIV); 817 value = apic_read(APIC_SPIV);
747 value &= ~APIC_SPIV_APIC_ENABLED; 818 value &= ~APIC_SPIV_APIC_ENABLED;
748 apic_write_around(APIC_SPIV, value); 819 apic_write(APIC_SPIV, value);
749 820
821#ifdef CONFIG_X86_32
750 /* 822 /*
751 * When LAPIC was disabled by the BIOS and enabled by the kernel, 823 * When LAPIC was disabled by the BIOS and enabled by the kernel,
752 * restore the disabled state. 824 * restore the disabled state.
@@ -758,6 +830,7 @@ void disable_local_APIC(void)
758 l &= ~MSR_IA32_APICBASE_ENABLE; 830 l &= ~MSR_IA32_APICBASE_ENABLE;
759 wrmsr(MSR_IA32_APICBASE, l, h); 831 wrmsr(MSR_IA32_APICBASE, l, h);
760 } 832 }
833#endif
761} 834}
762 835
763/* 836/*
@@ -774,11 +847,15 @@ void lapic_shutdown(void)
774 return; 847 return;
775 848
776 local_irq_save(flags); 849 local_irq_save(flags);
777 clear_local_APIC();
778 850
779 if (enabled_via_apicbase) 851#ifdef CONFIG_X86_32
852 if (!enabled_via_apicbase)
853 clear_local_APIC();
854 else
855#endif
780 disable_local_APIC(); 856 disable_local_APIC();
781 857
858
782 local_irq_restore(flags); 859 local_irq_restore(flags);
783} 860}
784 861
@@ -823,6 +900,12 @@ int __init verify_local_APIC(void)
823 */ 900 */
824 reg0 = apic_read(APIC_ID); 901 reg0 = apic_read(APIC_ID);
825 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 902 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
903 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
904 reg1 = apic_read(APIC_ID);
905 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
906 apic_write(APIC_ID, reg0);
907 if (reg1 != (reg0 ^ APIC_ID_MASK))
908 return 0;
826 909
827 /* 910 /*
828 * The next two are just to see if we have sane values. 911 * The next two are just to see if we have sane values.
@@ -848,14 +931,15 @@ void __init sync_Arb_IDs(void)
848 */ 931 */
849 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 932 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
850 return; 933 return;
934
851 /* 935 /*
852 * Wait for idle. 936 * Wait for idle.
853 */ 937 */
854 apic_wait_icr_idle(); 938 apic_wait_icr_idle();
855 939
856 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 940 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
857 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 941 apic_write(APIC_ICR, APIC_DEST_ALLINC |
858 | APIC_DM_INIT); 942 APIC_INT_LEVELTRIG | APIC_DM_INIT);
859} 943}
860 944
861/* 945/*
@@ -863,7 +947,7 @@ void __init sync_Arb_IDs(void)
863 */ 947 */
864void __init init_bsp_APIC(void) 948void __init init_bsp_APIC(void)
865{ 949{
866 unsigned long value; 950 unsigned int value;
867 951
868 /* 952 /*
869 * Don't do the setup now if we have a SMP BIOS as the 953 * Don't do the setup now if we have a SMP BIOS as the
@@ -884,29 +968,41 @@ void __init init_bsp_APIC(void)
884 value &= ~APIC_VECTOR_MASK; 968 value &= ~APIC_VECTOR_MASK;
885 value |= APIC_SPIV_APIC_ENABLED; 969 value |= APIC_SPIV_APIC_ENABLED;
886 970
971#ifdef CONFIG_X86_32
887 /* This bit is reserved on P4/Xeon and should be cleared */ 972 /* This bit is reserved on P4/Xeon and should be cleared */
888 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 973 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
889 (boot_cpu_data.x86 == 15)) 974 (boot_cpu_data.x86 == 15))
890 value &= ~APIC_SPIV_FOCUS_DISABLED; 975 value &= ~APIC_SPIV_FOCUS_DISABLED;
891 else 976 else
977#endif
892 value |= APIC_SPIV_FOCUS_DISABLED; 978 value |= APIC_SPIV_FOCUS_DISABLED;
893 value |= SPURIOUS_APIC_VECTOR; 979 value |= SPURIOUS_APIC_VECTOR;
894 apic_write_around(APIC_SPIV, value); 980 apic_write(APIC_SPIV, value);
895 981
896 /* 982 /*
897 * Set up the virtual wire mode. 983 * Set up the virtual wire mode.
898 */ 984 */
899 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 985 apic_write(APIC_LVT0, APIC_DM_EXTINT);
900 value = APIC_DM_NMI; 986 value = APIC_DM_NMI;
901 if (!lapic_is_integrated()) /* 82489DX */ 987 if (!lapic_is_integrated()) /* 82489DX */
902 value |= APIC_LVT_LEVEL_TRIGGER; 988 value |= APIC_LVT_LEVEL_TRIGGER;
903 apic_write_around(APIC_LVT1, value); 989 apic_write(APIC_LVT1, value);
904} 990}
905 991
906static void __cpuinit lapic_setup_esr(void) 992static void __cpuinit lapic_setup_esr(void)
907{ 993{
908 unsigned long oldvalue, value, maxlvt; 994 unsigned long oldvalue, value, maxlvt;
909 if (lapic_is_integrated() && !esr_disable) { 995 if (lapic_is_integrated() && !esr_disable) {
996 if (esr_disable) {
997 /*
998 * Something untraceable is creating bad interrupts on
999 * secondary quads ... for the moment, just leave the
1000 * ESR disabled - we can't do anything useful with the
1001 * errors anyway - mbligh
1002 */
1003 printk(KERN_INFO "Leaving ESR disabled.\n");
1004 return;
1005 }
910 /* !82489DX */ 1006 /* !82489DX */
911 maxlvt = lapic_get_maxlvt(); 1007 maxlvt = lapic_get_maxlvt();
912 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1008 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -915,7 +1011,7 @@ static void __cpuinit lapic_setup_esr(void)
915 1011
916 /* enables sending errors */ 1012 /* enables sending errors */
917 value = ERROR_APIC_VECTOR; 1013 value = ERROR_APIC_VECTOR;
918 apic_write_around(APIC_LVTERR, value); 1014 apic_write(APIC_LVTERR, value);
919 /* 1015 /*
920 * spec says clear errors after enabling vector. 1016 * spec says clear errors after enabling vector.
921 */ 1017 */
@@ -927,16 +1023,7 @@ static void __cpuinit lapic_setup_esr(void)
927 "vector: 0x%08lx after: 0x%08lx\n", 1023 "vector: 0x%08lx after: 0x%08lx\n",
928 oldvalue, value); 1024 oldvalue, value);
929 } else { 1025 } else {
930 if (esr_disable) 1026 printk(KERN_INFO "No ESR for 82489DX.\n");
931 /*
932 * Something untraceable is creating bad interrupts on
933 * secondary quads ... for the moment, just leave the
934 * ESR disabled - we can't do anything useful with the
935 * errors anyway - mbligh
936 */
937 printk(KERN_INFO "Leaving ESR disabled.\n");
938 else
939 printk(KERN_INFO "No ESR for 82489DX.\n");
940 } 1027 }
941} 1028}
942 1029
@@ -963,7 +1050,7 @@ void __cpuinit setup_local_APIC(void)
963 * Double-check whether this APIC is really registered. 1050 * Double-check whether this APIC is really registered.
964 */ 1051 */
965 if (!apic_id_registered()) 1052 if (!apic_id_registered())
966 BUG(); 1053 WARN_ON_ONCE(1);
967 1054
968 /* 1055 /*
969 * Intel recommends to set DFR, LDR and TPR before enabling 1056 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -978,7 +1065,7 @@ void __cpuinit setup_local_APIC(void)
978 */ 1065 */
979 value = apic_read(APIC_TASKPRI); 1066 value = apic_read(APIC_TASKPRI);
980 value &= ~APIC_TPRI_MASK; 1067 value &= ~APIC_TPRI_MASK;
981 apic_write_around(APIC_TASKPRI, value); 1068 apic_write(APIC_TASKPRI, value);
982 1069
983 /* 1070 /*
984 * After a crash, we no longer service the interrupts and a pending 1071 * After a crash, we no longer service the interrupts and a pending
@@ -1036,7 +1123,7 @@ void __cpuinit setup_local_APIC(void)
1036 * Set spurious IRQ vector 1123 * Set spurious IRQ vector
1037 */ 1124 */
1038 value |= SPURIOUS_APIC_VECTOR; 1125 value |= SPURIOUS_APIC_VECTOR;
1039 apic_write_around(APIC_SPIV, value); 1126 apic_write(APIC_SPIV, value);
1040 1127
1041 /* 1128 /*
1042 * Set up LVT0, LVT1: 1129 * Set up LVT0, LVT1:
@@ -1058,7 +1145,7 @@ void __cpuinit setup_local_APIC(void)
1058 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1145 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1059 smp_processor_id()); 1146 smp_processor_id());
1060 } 1147 }
1061 apic_write_around(APIC_LVT0, value); 1148 apic_write(APIC_LVT0, value);
1062 1149
1063 /* 1150 /*
1064 * only the BP should see the LINT1 NMI signal, obviously. 1151 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1069,18 +1156,22 @@ void __cpuinit setup_local_APIC(void)
1069 value = APIC_DM_NMI | APIC_LVT_MASKED; 1156 value = APIC_DM_NMI | APIC_LVT_MASKED;
1070 if (!integrated) /* 82489DX */ 1157 if (!integrated) /* 82489DX */
1071 value |= APIC_LVT_LEVEL_TRIGGER; 1158 value |= APIC_LVT_LEVEL_TRIGGER;
1072 apic_write_around(APIC_LVT1, value); 1159 apic_write(APIC_LVT1, value);
1073} 1160}
1074 1161
1075void __cpuinit end_local_APIC_setup(void) 1162void __cpuinit end_local_APIC_setup(void)
1076{ 1163{
1077 unsigned long value;
1078
1079 lapic_setup_esr(); 1164 lapic_setup_esr();
1080 /* Disable the local apic timer */ 1165
1081 value = apic_read(APIC_LVTT); 1166#ifdef CONFIG_X86_32
1082 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1167 {
1083 apic_write_around(APIC_LVTT, value); 1168 unsigned int value;
1169 /* Disable the local apic timer */
1170 value = apic_read(APIC_LVTT);
1171 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1172 apic_write(APIC_LVTT, value);
1173 }
1174#endif
1084 1175
1085 setup_apic_nmi_watchdog(NULL); 1176 setup_apic_nmi_watchdog(NULL);
1086 apic_pm_activate(); 1177 apic_pm_activate();
@@ -1094,7 +1185,7 @@ static int __init detect_init_APIC(void)
1094 u32 h, l, features; 1185 u32 h, l, features;
1095 1186
1096 /* Disabled by kernel option? */ 1187 /* Disabled by kernel option? */
1097 if (enable_local_apic < 0) 1188 if (disable_apic)
1098 return -1; 1189 return -1;
1099 1190
1100 switch (boot_cpu_data.x86_vendor) { 1191 switch (boot_cpu_data.x86_vendor) {
@@ -1117,7 +1208,7 @@ static int __init detect_init_APIC(void)
1117 * Over-ride BIOS and try to enable the local APIC only if 1208 * Over-ride BIOS and try to enable the local APIC only if
1118 * "lapic" specified. 1209 * "lapic" specified.
1119 */ 1210 */
1120 if (enable_local_apic <= 0) { 1211 if (!force_enable_local_apic) {
1121 printk(KERN_INFO "Local APIC disabled by BIOS -- " 1212 printk(KERN_INFO "Local APIC disabled by BIOS -- "
1122 "you can enable it with \"lapic\"\n"); 1213 "you can enable it with \"lapic\"\n");
1123 return -1; 1214 return -1;
@@ -1154,9 +1245,6 @@ static int __init detect_init_APIC(void)
1154 if (l & MSR_IA32_APICBASE_ENABLE) 1245 if (l & MSR_IA32_APICBASE_ENABLE)
1155 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; 1246 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1156 1247
1157 if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
1158 nmi_watchdog = NMI_LOCAL_APIC;
1159
1160 printk(KERN_INFO "Found and enabled local APIC!\n"); 1248 printk(KERN_INFO "Found and enabled local APIC!\n");
1161 1249
1162 apic_pm_activate(); 1250 apic_pm_activate();
@@ -1193,38 +1281,8 @@ void __init init_apic_mappings(void)
1193 * default configuration (or the MP table is broken). 1281 * default configuration (or the MP table is broken).
1194 */ 1282 */
1195 if (boot_cpu_physical_apicid == -1U) 1283 if (boot_cpu_physical_apicid == -1U)
1196 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1284 boot_cpu_physical_apicid = read_apic_id();
1197 1285
1198#ifdef CONFIG_X86_IO_APIC
1199 {
1200 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
1201 int i;
1202
1203 for (i = 0; i < nr_ioapics; i++) {
1204 if (smp_found_config) {
1205 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
1206 if (!ioapic_phys) {
1207 printk(KERN_ERR
1208 "WARNING: bogus zero IO-APIC "
1209 "address found in MPTABLE, "
1210 "disabling IO/APIC support!\n");
1211 smp_found_config = 0;
1212 skip_ioapic_setup = 1;
1213 goto fake_ioapic_page;
1214 }
1215 } else {
1216fake_ioapic_page:
1217 ioapic_phys = (unsigned long)
1218 alloc_bootmem_pages(PAGE_SIZE);
1219 ioapic_phys = __pa(ioapic_phys);
1220 }
1221 set_fixmap_nocache(idx, ioapic_phys);
1222 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
1223 __fix_to_virt(idx), ioapic_phys);
1224 idx++;
1225 }
1226 }
1227#endif
1228} 1286}
1229 1287
1230/* 1288/*
@@ -1236,9 +1294,6 @@ int apic_version[MAX_APICS];
1236 1294
1237int __init APIC_init_uniprocessor(void) 1295int __init APIC_init_uniprocessor(void)
1238{ 1296{
1239 if (enable_local_apic < 0)
1240 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1241
1242 if (!smp_found_config && !cpu_has_apic) 1297 if (!smp_found_config && !cpu_has_apic)
1243 return -1; 1298 return -1;
1244 1299
@@ -1263,12 +1318,16 @@ int __init APIC_init_uniprocessor(void)
1263 * might be zero if read from MP tables. Get it from LAPIC. 1318 * might be zero if read from MP tables. Get it from LAPIC.
1264 */ 1319 */
1265#ifdef CONFIG_CRASH_DUMP 1320#ifdef CONFIG_CRASH_DUMP
1266 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1321 boot_cpu_physical_apicid = read_apic_id();
1267#endif 1322#endif
1268 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 1323 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1269 1324
1270 setup_local_APIC(); 1325 setup_local_APIC();
1271 1326
1327#ifdef CONFIG_X86_IO_APIC
1328 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1329#endif
1330 localise_nmi_watchdog();
1272 end_local_APIC_setup(); 1331 end_local_APIC_setup();
1273#ifdef CONFIG_X86_IO_APIC 1332#ifdef CONFIG_X86_IO_APIC
1274 if (smp_found_config) 1333 if (smp_found_config)
@@ -1338,55 +1397,12 @@ void smp_error_interrupt(struct pt_regs *regs)
1338 irq_exit(); 1397 irq_exit();
1339} 1398}
1340 1399
1341#ifdef CONFIG_SMP
1342void __init smp_intr_init(void)
1343{
1344 /*
1345 * IRQ0 must be given a fixed assignment and initialized,
1346 * because it's used before the IO-APIC is set up.
1347 */
1348 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1349
1350 /*
1351 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1352 * IPI, driven by wakeup.
1353 */
1354 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1355
1356 /* IPI for invalidation */
1357 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1358
1359 /* IPI for generic function call */
1360 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1361}
1362#endif
1363
1364/*
1365 * Initialize APIC interrupts
1366 */
1367void __init apic_intr_init(void)
1368{
1369#ifdef CONFIG_SMP
1370 smp_intr_init();
1371#endif
1372 /* self generated IPI for local APIC timer */
1373 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1374
1375 /* IPI vectors for APIC spurious and error interrupts */
1376 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1377 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1378
1379 /* thermal monitor LVT interrupt */
1380#ifdef CONFIG_X86_MCE_P4THERMAL
1381 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1382#endif
1383}
1384
1385/** 1400/**
1386 * connect_bsp_APIC - attach the APIC to the interrupt system 1401 * connect_bsp_APIC - attach the APIC to the interrupt system
1387 */ 1402 */
1388void __init connect_bsp_APIC(void) 1403void __init connect_bsp_APIC(void)
1389{ 1404{
1405#ifdef CONFIG_X86_32
1390 if (pic_mode) { 1406 if (pic_mode) {
1391 /* 1407 /*
1392 * Do not trust the local APIC being empty at bootup. 1408 * Do not trust the local APIC being empty at bootup.
@@ -1401,6 +1417,7 @@ void __init connect_bsp_APIC(void)
1401 outb(0x70, 0x22); 1417 outb(0x70, 0x22);
1402 outb(0x01, 0x23); 1418 outb(0x01, 0x23);
1403 } 1419 }
1420#endif
1404 enable_apic_mode(); 1421 enable_apic_mode();
1405} 1422}
1406 1423
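For context (MP-specification background, not restated in the patch): the magic outb() pairs in connect_bsp_APIC()/disconnect_bsp_APIC() program the Interrupt Mode Configuration Register. Writing 0x70 to port 0x22 selects the IMCR, and the byte then written to port 0x23 picks the routing: 0x01 sends INTR/NMI through the local APIC, 0x00 routes them straight to the BSP as in PIC mode. A hedged sketch of helpers one could wrap around these sequences (hypothetical names, not part of this patch):

/* Hypothetical helpers, illustrative only. outb(value, port) is the
 * normal Linux ordering. */
static inline void imcr_pic_to_apic(void)
{
	outb(0x70, 0x22);	/* select IMCR */
	outb(0x01, 0x23);	/* route INTR/NMI through the local APIC */
}

static inline void imcr_apic_to_pic(void)
{
	outb(0x70, 0x22);	/* select IMCR */
	outb(0x00, 0x23);	/* route INTR/NMI directly to the BSP (PIC mode) */
}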
@@ -1413,6 +1430,9 @@ void __init connect_bsp_APIC(void)
1413 */ 1430 */
1414void disconnect_bsp_APIC(int virt_wire_setup) 1431void disconnect_bsp_APIC(int virt_wire_setup)
1415{ 1432{
1433 unsigned int value;
1434
1435#ifdef CONFIG_X86_32
1416 if (pic_mode) { 1436 if (pic_mode) {
1417 /* 1437 /*
1418 * Put the board back into PIC mode (has an effect only on 1438 * Put the board back into PIC mode (has an effect only on
@@ -1424,56 +1444,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1424 "entering PIC mode.\n"); 1444 "entering PIC mode.\n");
1425 outb(0x70, 0x22); 1445 outb(0x70, 0x22);
1426 outb(0x00, 0x23); 1446 outb(0x00, 0x23);
1427 } else { 1447 return;
1428 /* Go back to Virtual Wire compatibility mode */ 1448 }
1429 unsigned long value; 1449#endif
1430 1450
1431 /* For the spurious interrupt use vector F, and enable it */ 1451 /* Go back to Virtual Wire compatibility mode */
1432 value = apic_read(APIC_SPIV);
1433 value &= ~APIC_VECTOR_MASK;
1434 value |= APIC_SPIV_APIC_ENABLED;
1435 value |= 0xf;
1436 apic_write_around(APIC_SPIV, value);
1437 1452
1438 if (!virt_wire_setup) { 1453 /* For the spurious interrupt use vector F, and enable it */
1439 /* 1454 value = apic_read(APIC_SPIV);
1440 * For LVT0 make it edge triggered, active high, 1455 value &= ~APIC_VECTOR_MASK;
1441 * external and enabled 1456 value |= APIC_SPIV_APIC_ENABLED;
1442 */ 1457 value |= 0xf;
1443 value = apic_read(APIC_LVT0); 1458 apic_write(APIC_SPIV, value);
1444 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1445 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1446 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1447 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1448 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1449 apic_write_around(APIC_LVT0, value);
1450 } else {
1451 /* Disable LVT0 */
1452 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1453 }
1454 1459
1460 if (!virt_wire_setup) {
1455 /* 1461 /*
1456 * For LVT1 make it edge triggered, active high, nmi and 1462 * For LVT0 make it edge triggered, active high,
1457 * enabled 1463 * external and enabled
1458 */ 1464 */
1459 value = apic_read(APIC_LVT1); 1465 value = apic_read(APIC_LVT0);
1460 value &= ~( 1466 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1461 APIC_MODE_MASK | APIC_SEND_PENDING |
1462 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1467 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1463 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1468 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1464 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1469 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1465 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1470 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1466 apic_write_around(APIC_LVT1, value); 1471 apic_write(APIC_LVT0, value);
1472 } else {
1473 /* Disable LVT0 */
1474 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1467 } 1475 }
1468}
1469 1476
1470unsigned int __cpuinitdata maxcpus = NR_CPUS; 1477 /*
1478 * For LVT1 make it edge triggered, active high,
1479 * nmi and enabled
1480 */
1481 value = apic_read(APIC_LVT1);
1482 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1483 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1484 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1485 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1486 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1487 apic_write(APIC_LVT1, value);
1488}
1471 1489
1472void __cpuinit generic_processor_info(int apicid, int version) 1490void __cpuinit generic_processor_info(int apicid, int version)
1473{ 1491{
1474 int cpu; 1492 int cpu;
1475 cpumask_t tmp_map; 1493 cpumask_t tmp_map;
1476 physid_mask_t phys_cpu;
1477 1494
1478 /* 1495 /*
1479 * Validate version 1496 * Validate version
@@ -1486,33 +1503,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1486 } 1503 }
1487 apic_version[apicid] = version; 1504 apic_version[apicid] = version;
1488 1505
1489 phys_cpu = apicid_to_cpu_present(apicid);
1490 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1491
1492 if (num_processors >= NR_CPUS) { 1506 if (num_processors >= NR_CPUS) {
1493 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1507 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1494 " Processor ignored.\n", NR_CPUS); 1508 " Processor ignored.\n", NR_CPUS);
1495 return; 1509 return;
1496 } 1510 }
1497 1511
1498 if (num_processors >= maxcpus) {
1499 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1500 " Processor ignored.\n", maxcpus);
1501 return;
1502 }
1503
1504 num_processors++; 1512 num_processors++;
1505 cpus_complement(tmp_map, cpu_present_map); 1513 cpus_complement(tmp_map, cpu_present_map);
1506 cpu = first_cpu(tmp_map); 1514 cpu = first_cpu(tmp_map);
1507 1515
1508 if (apicid == boot_cpu_physical_apicid) 1516 physid_set(apicid, phys_cpu_present_map);
1517 if (apicid == boot_cpu_physical_apicid) {
1509 /* 1518 /*
1510 * x86_bios_cpu_apicid is required to have processors listed 1519 * x86_bios_cpu_apicid is required to have processors listed
1511 * in same order as logical cpu numbers. Hence the first 1520 * in same order as logical cpu numbers. Hence the first
1512 * entry is BSP, and so on. 1521 * entry is BSP, and so on.
1513 */ 1522 */
1514 cpu = 0; 1523 cpu = 0;
1524 }
1525 if (apicid > max_physical_apicid)
1526 max_physical_apicid = apicid;
1515 1527
1528#ifdef CONFIG_X86_32
1516 /* 1529 /*
1517 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1530 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1518 * but we need to work other dependencies like SMP_SUSPEND etc 1531 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1520,7 +1533,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1520 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) 1533 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1521 * - Ashok Raj <ashok.raj@intel.com> 1534 * - Ashok Raj <ashok.raj@intel.com>
1522 */ 1535 */
1523 if (num_processors > 8) { 1536 if (max_physical_apicid >= 8) {
1524 switch (boot_cpu_data.x86_vendor) { 1537 switch (boot_cpu_data.x86_vendor) {
1525 case X86_VENDOR_INTEL: 1538 case X86_VENDOR_INTEL:
1526 if (!APIC_XAPIC(version)) { 1539 if (!APIC_XAPIC(version)) {
@@ -1532,11 +1545,13 @@ void __cpuinit generic_processor_info(int apicid, int version)
1532 def_to_bigsmp = 1; 1545 def_to_bigsmp = 1;
1533 } 1546 }
1534 } 1547 }
1535#ifdef CONFIG_SMP 1548#endif
1549
1550#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1536 /* are we being called early in kernel startup? */ 1551 /* are we being called early in kernel startup? */
1537 if (x86_cpu_to_apicid_early_ptr) { 1552 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1538 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1553 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1539 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1554 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1540 1555
1541 cpu_to_apicid[cpu] = apicid; 1556 cpu_to_apicid[cpu] = apicid;
1542 bios_cpu_apicid[cpu] = apicid; 1557 bios_cpu_apicid[cpu] = apicid;
@@ -1545,6 +1560,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1545 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1560 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1546 } 1561 }
1547#endif 1562#endif
1563
1548 cpu_set(cpu, cpu_possible_map); 1564 cpu_set(cpu, cpu_possible_map);
1549 cpu_set(cpu, cpu_present_map); 1565 cpu_set(cpu, cpu_present_map);
1550} 1566}
@@ -1555,6 +1571,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1555#ifdef CONFIG_PM 1571#ifdef CONFIG_PM
1556 1572
1557static struct { 1573static struct {
1574 /*
1575 * 'active' is true if the local APIC was enabled by us and
1576 * not the BIOS; this signifies that we are also responsible
1577 * for disabling it before entering apm/acpi suspend
1578 */
1558 int active; 1579 int active;
1559 /* r/w apic fields */ 1580 /* r/w apic fields */
1560 unsigned int apic_id; 1581 unsigned int apic_id;
@@ -1595,7 +1616,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1595 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1616 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1596 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1617 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1597 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1618 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1598#ifdef CONFIG_X86_MCE_P4THERMAL 1619#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1599 if (maxlvt >= 5) 1620 if (maxlvt >= 5)
1600 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1621 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1601#endif 1622#endif
@@ -1619,16 +1640,23 @@ static int lapic_resume(struct sys_device *dev)
1619 1640
1620 local_irq_save(flags); 1641 local_irq_save(flags);
1621 1642
1622 /* 1643#ifdef CONFIG_X86_64
1623 * Make sure the APICBASE points to the right address 1644 if (x2apic)
1624 * 1645 enable_x2apic();
1625 * FIXME! This will be wrong if we ever support suspend on 1646 else
1626 * SMP! We'll need to do this as part of the CPU restore! 1647#endif
1627 */ 1648 {
1628 rdmsr(MSR_IA32_APICBASE, l, h); 1649 /*
1629 l &= ~MSR_IA32_APICBASE_BASE; 1650 * Make sure the APICBASE points to the right address
1630 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1651 *
1631 wrmsr(MSR_IA32_APICBASE, l, h); 1652 * FIXME! This will be wrong if we ever support suspend on
1653 * SMP! We'll need to do this as part of the CPU restore!
1654 */
1655 rdmsr(MSR_IA32_APICBASE, l, h);
1656 l &= ~MSR_IA32_APICBASE_BASE;
1657 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1658 wrmsr(MSR_IA32_APICBASE, l, h);
1659 }
1632 1660
1633 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1661 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1634 apic_write(APIC_ID, apic_pm_state.apic_id); 1662 apic_write(APIC_ID, apic_pm_state.apic_id);
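A brief note on the rdmsr/wrmsr block above (SDM facts, assumed here rather than spelled out in the patch): MSR_IA32_APICBASE is MSR 0x1B; bit 8 flags the BSP, bit 10 is the x2APIC (EXTD) enable that enable_x2apic() sets, bit 11 is the global xAPIC enable (MSR_IA32_APICBASE_ENABLE), and bits 12 and up hold the APIC base physical address (the MSR_IA32_APICBASE_BASE mask). Clearing the base field and OR-ing in MSR_IA32_APICBASE_ENABLE | mp_lapic_addr therefore re-enables the xAPIC at the address recorded at boot, while the new x2apic branch skips this entirely because x2APIC mode has no MMIO window to restore.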
@@ -1638,7 +1666,7 @@ static int lapic_resume(struct sys_device *dev)
1638 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1666 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1639 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1667 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1640 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1668 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1641#ifdef CONFIG_X86_MCE_P4THERMAL 1669#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1642 if (maxlvt >= 5) 1670 if (maxlvt >= 5)
1643 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1671 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1644#endif 1672#endif
@@ -1652,7 +1680,9 @@ static int lapic_resume(struct sys_device *dev)
1652 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1680 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1653 apic_write(APIC_ESR, 0); 1681 apic_write(APIC_ESR, 0);
1654 apic_read(APIC_ESR); 1682 apic_read(APIC_ESR);
1683
1655 local_irq_restore(flags); 1684 local_irq_restore(flags);
1685
1656 return 0; 1686 return 0;
1657} 1687}
1658 1688
@@ -1703,25 +1733,25 @@ static void apic_pm_activate(void) { }
1703 */ 1733 */
1704static int __init parse_lapic(char *arg) 1734static int __init parse_lapic(char *arg)
1705{ 1735{
1706 enable_local_apic = 1; 1736 force_enable_local_apic = 1;
1707 return 0; 1737 return 0;
1708} 1738}
1709early_param("lapic", parse_lapic); 1739early_param("lapic", parse_lapic);
1710 1740
1711static int __init parse_nolapic(char *arg) 1741static int __init setup_disableapic(char *arg)
1712{ 1742{
1713 enable_local_apic = -1; 1743 disable_apic = 1;
1714 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1744 setup_clear_cpu_cap(X86_FEATURE_APIC);
1715 return 0; 1745 return 0;
1716} 1746}
1717early_param("nolapic", parse_nolapic); 1747early_param("disableapic", setup_disableapic);
1718 1748
1719static int __init parse_disable_lapic_timer(char *arg) 1749/* same as disableapic, for compatibility */
1750static int __init setup_nolapic(char *arg)
1720{ 1751{
1721 local_apic_timer_disabled = 1; 1752 return setup_disableapic(arg);
1722 return 0;
1723} 1753}
1724early_param("nolapic_timer", parse_disable_lapic_timer); 1754early_param("nolapic", setup_nolapic);
1725 1755
1726static int __init parse_lapic_timer_c2_ok(char *arg) 1756static int __init parse_lapic_timer_c2_ok(char *arg)
1727{ 1757{
@@ -1730,13 +1760,60 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1730} 1760}
1731early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1761early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1732 1762
1733static int __init apic_set_verbosity(char *str) 1763static int __init parse_disable_apic_timer(char *arg)
1764{
1765 disable_apic_timer = 1;
1766 return 0;
1767}
1768early_param("noapictimer", parse_disable_apic_timer);
1769
1770static int __init parse_nolapic_timer(char *arg)
1771{
1772 disable_apic_timer = 1;
1773 return 0;
1774}
1775early_param("nolapic_timer", parse_nolapic_timer);
1776
1777static int __init apic_set_verbosity(char *arg)
1734{ 1778{
1735 if (strcmp("debug", str) == 0) 1779 if (!arg) {
1780#ifdef CONFIG_X86_64
1781 skip_ioapic_setup = 0;
1782 ioapic_force = 1;
1783 return 0;
1784#endif
1785 return -EINVAL;
1786 }
1787
1788 if (strcmp("debug", arg) == 0)
1736 apic_verbosity = APIC_DEBUG; 1789 apic_verbosity = APIC_DEBUG;
1737 else if (strcmp("verbose", str) == 0) 1790 else if (strcmp("verbose", arg) == 0)
1738 apic_verbosity = APIC_VERBOSE; 1791 apic_verbosity = APIC_VERBOSE;
1739 return 1; 1792 else {
1793 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1794 " use apic=verbose or apic=debug\n", arg);
1795 return -EINVAL;
1796 }
1797
1798 return 0;
1799}
1800early_param("apic", apic_set_verbosity);
1801
1802static int __init lapic_insert_resource(void)
1803{
1804 if (!apic_phys)
1805 return -1;
1806
1807 /* Put local APIC into the resource map. */
1808 lapic_resource.start = apic_phys;
1809 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1810 insert_resource(&iomem_resource, &lapic_resource);
1811
1812 return 0;
1740} 1813}
1741__setup("apic=", apic_set_verbosity);
1742 1814
1815/*
1816 * need call insert after e820_reserve_resources()
1817 * that is using request_resource
1818 */
1819late_initcall(lapic_insert_resource);
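One kernel convention the conversions above rely on (background, not restated in the patch): __setup() handlers report "option consumed" by returning non-zero, whereas early_param() handlers return 0 on success and non-zero (e.g. -EINVAL) to flag a malformed argument. That is why apic_set_verbosity() switches from "return 1" to "return 0" / -EINVAL when "apic=" becomes an early_param. A pair of hypothetical handlers showing the two conventions side by side, for illustration only:

/* Hypothetical handlers -- not part of this patch. */
static int __init foo_setup(char *str)		/* __setup("foo=", foo_setup) */
{
	return 1;	/* non-zero: option handled, not passed on to init */
}

static int __init foo_early(char *arg)		/* early_param("foo", foo_early) */
{
	if (!arg)
		return -EINVAL;	/* reported as a malformed early option */
	return 0;		/* success */
}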
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..94ddb69ae15e 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -27,6 +27,7 @@
27#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/dmar.h>
30 31
31#include <asm/atomic.h> 32#include <asm/atomic.h>
32#include <asm/smp.h> 33#include <asm/smp.h>
@@ -39,13 +40,20 @@
39#include <asm/proto.h> 40#include <asm/proto.h>
40#include <asm/timex.h> 41#include <asm/timex.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/i8259.h>
42 44
43#include <mach_ipi.h> 45#include <mach_ipi.h>
44#include <mach_apic.h> 46#include <mach_apic.h>
45 47
46int disable_apic_timer __cpuinitdata; 48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
49static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
48int disable_apic; 51int disable_apic;
52int disable_x2apic;
53int x2apic;
54
55/* x2apic enabled before OS handover */
56int x2apic_preenabled;
49 57
50/* Local APIC timer works in C2 */ 58/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok; 59int local_apic_timer_c2_ok;
@@ -54,7 +62,10 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 62/*
55 * Debug level, exported for io_apic.c 63 * Debug level, exported for io_apic.c
56 */ 64 */
57int apic_verbosity; 65unsigned int apic_verbosity;
66
67/* Have we found an MP table */
68int smp_found_config;
58 69
59static struct resource lapic_resource = { 70static struct resource lapic_resource = {
60 .name = "Local APIC", 71 .name = "Local APIC",
@@ -70,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
70static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
71static void apic_pm_activate(void); 82static void apic_pm_activate(void);
72 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
73static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
74 .name = "lapic", 88 .name = "lapic",
75 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -87,10 +101,6 @@ static unsigned long apic_phys;
87 101
88unsigned long mp_lapic_addr; 102unsigned long mp_lapic_addr;
89 103
90DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
91EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
92
93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/* 104/*
95 * Get the LAPIC version 105 * Get the LAPIC version
96 */ 106 */
@@ -100,11 +110,15 @@ static inline int lapic_get_version(void)
100} 110}
101 111
102/* 112/*
103 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
104 */ 114 */
105static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
106{ 116{
117#ifdef CONFIG_X86_64
107 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
108} 122}
109 123
110/* 124/*
@@ -119,13 +133,18 @@ static int modern_apic(void)
119 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
120} 134}
121 135
122void apic_wait_icr_idle(void) 136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
141void xapic_wait_icr_idle(void)
123{ 142{
124 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
125 cpu_relax(); 144 cpu_relax();
126} 145}
127 146
128u32 safe_apic_wait_icr_idle(void) 147u32 safe_xapic_wait_icr_idle(void)
129{ 148{
130 u32 send_status; 149 u32 send_status;
131 int timeout; 150 int timeout;
@@ -141,6 +160,68 @@ u32 safe_apic_wait_icr_idle(void)
141 return send_status; 160 return send_status;
142} 161}
143 162
163void xapic_icr_write(u32 low, u32 id)
164{
165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
166 apic_write(APIC_ICR, low);
167}
168
169u64 xapic_icr_read(void)
170{
171 u32 icr1, icr2;
172
173 icr2 = apic_read(APIC_ICR2);
174 icr1 = apic_read(APIC_ICR);
175
176 return icr1 | ((u64)icr2 << 32);
177}
178
179static struct apic_ops xapic_ops = {
180 .read = native_apic_mem_read,
181 .write = native_apic_mem_write,
182 .icr_read = xapic_icr_read,
183 .icr_write = xapic_icr_write,
184 .wait_icr_idle = xapic_wait_icr_idle,
185 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
186};
187
188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
189EXPORT_SYMBOL_GPL(apic_ops);
190
191static void x2apic_wait_icr_idle(void)
192{
193 /* no need to wait for icr idle in x2apic */
194 return;
195}
196
197static u32 safe_x2apic_wait_icr_idle(void)
198{
199 /* no need to wait for icr idle in x2apic */
200 return 0;
201}
202
203void x2apic_icr_write(u32 low, u32 id)
204{
205 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
206}
207
208u64 x2apic_icr_read(void)
209{
210 unsigned long val;
211
212 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
213 return val;
214}
215
216static struct apic_ops x2apic_ops = {
217 .read = native_apic_msr_read,
218 .write = native_apic_msr_write,
219 .icr_read = x2apic_icr_read,
220 .icr_write = x2apic_icr_write,
221 .wait_icr_idle = x2apic_wait_icr_idle,
222 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
223};
224
144/** 225/**
145 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 226 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
146 */ 227 */
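The apic_ops table added above is consumed through thin inline wrappers (expected to live in asm/apic.h; the sketch below is an assumption for illustration, not part of this hunk). Callers keep using apic_read()/apic_write(), and the pointer swap performed by check_x2apic()/enable_IR_x2apic() silently turns MMIO accesses into MSR accesses:

/* Sketch of the dispatch layer assumed by the ops table above. */
static inline u32 apic_read(u32 reg)
{
	return apic_ops->read(reg);
}

static inline void apic_write(u32 reg, u32 val)
{
	apic_ops->write(reg, val);
}

static inline void apic_icr_write(u32 low, u32 id)
{
	apic_ops->icr_write(low, id);
}

static inline u64 apic_icr_read(void)
{
	return apic_ops->icr_read();
}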
@@ -150,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void)
150 231
151 /* unmask and set to NMI */ 232 /* unmask and set to NMI */
152 v = APIC_DM_NMI; 233 v = APIC_DM_NMI;
234
235 /* Level triggered for 82489DX (32bit mode) */
236 if (!lapic_is_integrated())
237 v |= APIC_LVT_LEVEL_TRIGGER;
238
153 apic_write(APIC_LVT0, v); 239 apic_write(APIC_LVT0, v);
154} 240}
155 241
@@ -158,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void)
158 */ 244 */
159int lapic_get_maxlvt(void) 245int lapic_get_maxlvt(void)
160{ 246{
161 unsigned int v, maxlvt; 247 unsigned int v;
162 248
163 v = apic_read(APIC_LVR); 249 v = apic_read(APIC_LVR);
164 maxlvt = GET_APIC_MAXLVT(v); 250 /*
165 return maxlvt; 251 * - we always have APIC integrated on 64bit mode
252 * - 82489DXs do not report # of LVT entries
253 */
254 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
166} 255}
167 256
168/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
 262#ifdef CONFIG_X86_64

263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
169 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
170 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
171 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -175,7 +275,6 @@ int lapic_get_maxlvt(void)
175 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
176 * P5 APIC double write bug. 276 * P5 APIC double write bug.
177 */ 277 */
178
179static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
180{ 279{
181 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -183,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
183 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
184 if (!oneshot) 283 if (!oneshot)
185 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
186 if (!irqen) 288 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
188 290
@@ -192,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
192 * Divide PICLK by 16 294 * Divide PICLK by 16
193 */ 295 */
194 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
195 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
197 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
198 300
199 if (!oneshot) 301 if (!oneshot)
200 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
201} 303}
202 304
203/* 305/*
@@ -205,6 +307,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
205 * 307 *
206 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 308 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
207 * MCE interrupts are supported. Thus MCE offset must be set to 0. 309 * MCE interrupts are supported. Thus MCE offset must be set to 0.
310 *
311 * If mask=1, the LVT entry does not generate interrupts while mask=0
312 * enables the vector. See also the BKDGs.
208 */ 313 */
209 314
210#define APIC_EILVT_LVTOFF_MCE 0 315#define APIC_EILVT_LVTOFF_MCE 0
@@ -229,6 +334,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
229 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 334 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
230 return APIC_EILVT_LVTOFF_IBS; 335 return APIC_EILVT_LVTOFF_IBS;
231} 336}
337EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
232 338
233/* 339/*
234 * Program the next event, relative to now 340 * Program the next event, relative to now
@@ -314,7 +420,7 @@ static void setup_APIC_timer(void)
314 420
315#define TICK_COUNT 100000000 421#define TICK_COUNT 100000000
316 422
317static void __init calibrate_APIC_clock(void) 423static int __init calibrate_APIC_clock(void)
318{ 424{
319 unsigned apic, apic_start; 425 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 426 unsigned long tsc, tsc_start;
@@ -367,7 +473,18 @@ static void __init calibrate_APIC_clock(void)
367 lapic_clockevent.min_delta_ns = 473 lapic_clockevent.min_delta_ns =
368 clockevent_delta2ns(0xF, &lapic_clockevent); 474 clockevent_delta2ns(0xF, &lapic_clockevent);
369 475
370 calibration_result = result / HZ; 476 calibration_result = (result * APIC_DIVISOR) / HZ;
477
478 /*
479 * Do a sanity check on the APIC calibration result
480 */
481 if (calibration_result < (1000000 / HZ)) {
482 printk(KERN_WARNING
483 "APIC frequency too slow, disabling apic timer\n");
484 return -1;
485 }
486
487 return 0;
371} 488}
372 489
373/* 490/*
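To make the new sanity check concrete (illustrative numbers, not from the patch): calibrate_APIC_clock() now stores calibration_result = (result * APIC_DIVISOR) / HZ, i.e. APIC timer ticks per jiffy, and rejects anything below 1000000 / HZ -- equivalently, any measured APIC timer rate below 1 MHz. With HZ=250, a healthy 200 MHz bus clock behind the divide-by-16 TDCR setting counts at 12.5 MHz, or 50,000 ticks per jiffy, comfortably above the 4,000-tick threshold; a result of only a few hundred ticks would indicate a broken or emulated timer, the function returns -1, and the lapic clockevent stays registered as a dummy so broadcast mode is used instead.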
@@ -378,10 +495,10 @@ static void __init calibrate_APIC_clock(void)
378void __init setup_boot_APIC_clock(void) 495void __init setup_boot_APIC_clock(void)
379{ 496{
380 /* 497 /*
381 * The local apic timer can be disabled via the kernel commandline. 498 * The local apic timer can be disabled via the kernel
382 * Register the lapic timer as a dummy clock event source on SMP 499 * commandline or from the CPU detection code. Register the lapic
383 * systems, so the broadcast mechanism is used. On UP systems simply 500 * timer as a dummy clock event source on SMP systems, so the
384 * ignore it. 501 * broadcast mechanism is used. On UP systems simply ignore it.
385 */ 502 */
386 if (disable_apic_timer) { 503 if (disable_apic_timer) {
387 printk(KERN_INFO "Disabling APIC timer\n"); 504 printk(KERN_INFO "Disabling APIC timer\n");
@@ -393,15 +510,10 @@ void __init setup_boot_APIC_clock(void)
393 return; 510 return;
394 } 511 }
395 512
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 513 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
397 calibrate_APIC_clock(); 514 "calibrating APIC timer ...\n");
398 515
399 /* 516 if (calibrate_APIC_clock()) {
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 517 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 518 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 519 setup_APIC_timer();
@@ -417,37 +529,14 @@ void __init setup_boot_APIC_clock(void)
417 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 529 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
418 else 530 else
419 printk(KERN_WARNING "APIC timer registered as dummy," 531 printk(KERN_WARNING "APIC timer registered as dummy,"
420 " due to nmi_watchdog=1!\n"); 532 " due to nmi_watchdog=%d!\n", nmi_watchdog);
421 533
534 /* Setup the lapic or request the broadcast */
422 setup_APIC_timer(); 535 setup_APIC_timer();
423} 536}
424 537
425/*
426 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
427 * C1E flag only in the secondary CPU, so when we detect the wreckage
428 * we already have enabled the boot CPU local apic timer. Check, if
429 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
430 * set the DUMMY flag again and force the broadcast mode in the
431 * clockevents layer.
432 */
433static void __cpuinit check_boot_apic_timer_broadcast(void)
434{
435 if (!disable_apic_timer ||
436 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
437 return;
438
439 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
440 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
441
442 local_irq_enable();
443 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
444 &boot_cpu_physical_apicid);
445 local_irq_disable();
446}
447
448void __cpuinit setup_secondary_APIC_clock(void) 538void __cpuinit setup_secondary_APIC_clock(void)
449{ 539{
450 check_boot_apic_timer_broadcast();
451 setup_APIC_timer(); 540 setup_APIC_timer();
452} 541}
453 542
@@ -481,7 +570,11 @@ static void local_apic_timer_interrupt(void)
481 /* 570 /*
482 * the NMI deadlock-detector uses this. 571 * the NMI deadlock-detector uses this.
483 */ 572 */
573#ifdef CONFIG_X86_64
484 add_pda(apic_timer_irqs, 1); 574 add_pda(apic_timer_irqs, 1);
575#else
576 per_cpu(irq_stat, cpu).apic_timer_irqs++;
577#endif
485 578
486 evt->event_handler(evt); 579 evt->event_handler(evt);
487} 580}
@@ -512,6 +605,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
512 irq_enter(); 605 irq_enter();
513 local_apic_timer_interrupt(); 606 local_apic_timer_interrupt();
514 irq_exit(); 607 irq_exit();
608
515 set_irq_regs(old_regs); 609 set_irq_regs(old_regs);
516} 610}
517 611
@@ -565,6 +659,13 @@ void clear_local_APIC(void)
565 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 659 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
566 } 660 }
567 661
662 /* lets not touch this if we didn't frob it */
 663#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
664 if (maxlvt >= 5) {
665 v = apic_read(APIC_LVTTHMR);
666 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
667 }
668#endif
568 /* 669 /*
569 * Clean APIC state for other OSs: 670 * Clean APIC state for other OSs:
570 */ 671 */
@@ -575,8 +676,14 @@ void clear_local_APIC(void)
575 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 676 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
576 if (maxlvt >= 4) 677 if (maxlvt >= 4)
577 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 678 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
578 apic_write(APIC_ESR, 0); 679
579 apic_read(APIC_ESR); 680 /* Integrated APIC (!82489DX) ? */
681 if (lapic_is_integrated()) {
682 if (maxlvt > 3)
683 /* Clear ESR due to Pentium errata 3AP and 11AP */
684 apic_write(APIC_ESR, 0);
685 apic_read(APIC_ESR);
686 }
580} 687}
581 688
582/** 689/**
@@ -595,8 +702,28 @@ void disable_local_APIC(void)
595 value = apic_read(APIC_SPIV); 702 value = apic_read(APIC_SPIV);
596 value &= ~APIC_SPIV_APIC_ENABLED; 703 value &= ~APIC_SPIV_APIC_ENABLED;
597 apic_write(APIC_SPIV, value); 704 apic_write(APIC_SPIV, value);
705
706#ifdef CONFIG_X86_32
707 /*
708 * When LAPIC was disabled by the BIOS and enabled by the kernel,
709 * restore the disabled state.
710 */
711 if (enabled_via_apicbase) {
712 unsigned int l, h;
713
714 rdmsr(MSR_IA32_APICBASE, l, h);
715 l &= ~MSR_IA32_APICBASE_ENABLE;
716 wrmsr(MSR_IA32_APICBASE, l, h);
717 }
718#endif
598} 719}
599 720
721/*
722 * If Linux enabled the LAPIC against the BIOS default disable it down before
723 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
724 * not power-off. Additionally clear all LVT entries before disable_local_APIC
725 * for the case where Linux didn't enable the LAPIC.
726 */
600void lapic_shutdown(void) 727void lapic_shutdown(void)
601{ 728{
602 unsigned long flags; 729 unsigned long flags;
@@ -606,7 +733,13 @@ void lapic_shutdown(void)
606 733
607 local_irq_save(flags); 734 local_irq_save(flags);
608 735
609 disable_local_APIC(); 736#ifdef CONFIG_X86_32
737 if (!enabled_via_apicbase)
738 clear_local_APIC();
739 else
740#endif
741 disable_local_APIC();
742
610 743
611 local_irq_restore(flags); 744 local_irq_restore(flags);
612} 745}
@@ -650,10 +783,10 @@ int __init verify_local_APIC(void)
650 /* 783 /*
651 * The ID register is read/write in a real APIC. 784 * The ID register is read/write in a real APIC.
652 */ 785 */
653 reg0 = read_apic_id(); 786 reg0 = apic_read(APIC_ID);
654 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 787 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
655 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 788 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
656 reg1 = read_apic_id(); 789 reg1 = apic_read(APIC_ID);
657 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 790 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
658 apic_write(APIC_ID, reg0); 791 apic_write(APIC_ID, reg0);
659 if (reg1 != (reg0 ^ APIC_ID_MASK)) 792 if (reg1 != (reg0 ^ APIC_ID_MASK))
@@ -677,8 +810,11 @@ int __init verify_local_APIC(void)
677 */ 810 */
678void __init sync_Arb_IDs(void) 811void __init sync_Arb_IDs(void)
679{ 812{
680 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 813 /*
681 if (modern_apic()) 814 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
815 * needed on AMD.
816 */
817 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
682 return; 818 return;
683 819
684 /* 820 /*
@@ -687,8 +823,8 @@ void __init sync_Arb_IDs(void)
687 apic_wait_icr_idle(); 823 apic_wait_icr_idle();
688 824
689 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 825 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
690 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 826 apic_write(APIC_ICR, APIC_DEST_ALLINC |
691 | APIC_DM_INIT); 827 APIC_INT_LEVELTRIG | APIC_DM_INIT);
692} 828}
693 829
694/* 830/*
@@ -705,8 +841,6 @@ void __init init_bsp_APIC(void)
705 if (smp_found_config || !cpu_has_apic) 841 if (smp_found_config || !cpu_has_apic)
706 return; 842 return;
707 843
708 value = apic_read(APIC_LVR);
709
710 /* 844 /*
711 * Do not trust the local APIC being empty at bootup. 845 * Do not trust the local APIC being empty at bootup.
712 */ 846 */
@@ -718,7 +852,15 @@ void __init init_bsp_APIC(void)
718 value = apic_read(APIC_SPIV); 852 value = apic_read(APIC_SPIV);
719 value &= ~APIC_VECTOR_MASK; 853 value &= ~APIC_VECTOR_MASK;
720 value |= APIC_SPIV_APIC_ENABLED; 854 value |= APIC_SPIV_APIC_ENABLED;
721 value |= APIC_SPIV_FOCUS_DISABLED; 855
856#ifdef CONFIG_X86_32
857 /* This bit is reserved on P4/Xeon and should be cleared */
858 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
859 (boot_cpu_data.x86 == 15))
860 value &= ~APIC_SPIV_FOCUS_DISABLED;
861 else
862#endif
863 value |= APIC_SPIV_FOCUS_DISABLED;
722 value |= SPURIOUS_APIC_VECTOR; 864 value |= SPURIOUS_APIC_VECTOR;
723 apic_write(APIC_SPIV, value); 865 apic_write(APIC_SPIV, value);
724 866
@@ -727,9 +869,50 @@ void __init init_bsp_APIC(void)
727 */ 869 */
728 apic_write(APIC_LVT0, APIC_DM_EXTINT); 870 apic_write(APIC_LVT0, APIC_DM_EXTINT);
729 value = APIC_DM_NMI; 871 value = APIC_DM_NMI;
872 if (!lapic_is_integrated()) /* 82489DX */
873 value |= APIC_LVT_LEVEL_TRIGGER;
730 apic_write(APIC_LVT1, value); 874 apic_write(APIC_LVT1, value);
731} 875}
732 876
877static void __cpuinit lapic_setup_esr(void)
878{
879 unsigned long oldvalue, value, maxlvt;
880 if (lapic_is_integrated() && !esr_disable) {
881 if (esr_disable) {
882 /*
883 * Something untraceable is creating bad interrupts on
884 * secondary quads ... for the moment, just leave the
885 * ESR disabled - we can't do anything useful with the
886 * errors anyway - mbligh
887 */
888 printk(KERN_INFO "Leaving ESR disabled.\n");
889 return;
890 }
891 /* !82489DX */
892 maxlvt = lapic_get_maxlvt();
893 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
894 apic_write(APIC_ESR, 0);
895 oldvalue = apic_read(APIC_ESR);
896
897 /* enables sending errors */
898 value = ERROR_APIC_VECTOR;
899 apic_write(APIC_LVTERR, value);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905 value = apic_read(APIC_ESR);
906 if (value != oldvalue)
907 apic_printk(APIC_VERBOSE, "ESR value before enabling "
908 "vector: 0x%08lx after: 0x%08lx\n",
909 oldvalue, value);
910 } else {
911 printk(KERN_INFO "No ESR for 82489DX.\n");
912 }
913}
914
915
733/** 916/**
734 * setup_local_APIC - setup the local APIC 917 * setup_local_APIC - setup the local APIC
735 */ 918 */
@@ -835,26 +1018,143 @@ void __cpuinit setup_local_APIC(void)
835 preempt_enable(); 1018 preempt_enable();
836} 1019}
837 1020
838static void __cpuinit lapic_setup_esr(void)
839{
840 unsigned maxlvt = lapic_get_maxlvt();
841
842 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
843 /*
844 * spec says clear errors after enabling vector.
845 */
846 if (maxlvt > 3)
847 apic_write(APIC_ESR, 0);
848}
849
850void __cpuinit end_local_APIC_setup(void) 1021void __cpuinit end_local_APIC_setup(void)
851{ 1022{
852 lapic_setup_esr(); 1023 lapic_setup_esr();
853 nmi_watchdog_default(); 1024
1025#ifdef CONFIG_X86_32
1026 {
1027 unsigned int value;
1028 /* Disable the local apic timer */
1029 value = apic_read(APIC_LVTT);
1030 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1031 apic_write(APIC_LVTT, value);
1032 }
1033#endif
1034
854 setup_apic_nmi_watchdog(NULL); 1035 setup_apic_nmi_watchdog(NULL);
855 apic_pm_activate(); 1036 apic_pm_activate();
856} 1037}
857 1038
1039void check_x2apic(void)
1040{
1041 int msr, msr2;
1042
1043 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1044
1045 if (msr & X2APIC_ENABLE) {
1046 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1047 x2apic_preenabled = x2apic = 1;
1048 apic_ops = &x2apic_ops;
1049 }
1050}
1051
1052void enable_x2apic(void)
1053{
1054 int msr, msr2;
1055
1056 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1057 if (!(msr & X2APIC_ENABLE)) {
1058 printk("Enabling x2apic\n");
1059 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1060 }
1061}
1062
1063void enable_IR_x2apic(void)
1064{
1065#ifdef CONFIG_INTR_REMAP
1066 int ret;
1067 unsigned long flags;
1068
1069 if (!cpu_has_x2apic)
1070 return;
1071
1072 if (!x2apic_preenabled && disable_x2apic) {
1073 printk(KERN_INFO
1074 "Skipped enabling x2apic and Interrupt-remapping "
1075 "because of nox2apic\n");
1076 return;
1077 }
1078
1079 if (x2apic_preenabled && disable_x2apic)
1080 panic("Bios already enabled x2apic, can't enforce nox2apic");
1081
1082 if (!x2apic_preenabled && skip_ioapic_setup) {
1083 printk(KERN_INFO
1084 "Skipped enabling x2apic and Interrupt-remapping "
1085 "because of skipping io-apic setup\n");
1086 return;
1087 }
1088
1089 ret = dmar_table_init();
1090 if (ret) {
1091 printk(KERN_INFO
1092 "dmar_table_init() failed with %d:\n", ret);
1093
1094 if (x2apic_preenabled)
1095 panic("x2apic enabled by bios. But IR enabling failed");
1096 else
1097 printk(KERN_INFO
1098 "Not enabling x2apic,Intr-remapping\n");
1099 return;
1100 }
1101
1102 local_irq_save(flags);
1103 mask_8259A();
1104 save_mask_IO_APIC_setup();
1105
1106 ret = enable_intr_remapping(1);
1107
1108 if (ret && x2apic_preenabled) {
1109 local_irq_restore(flags);
1110 panic("x2apic enabled by bios. But IR enabling failed");
1111 }
1112
1113 if (ret)
1114 goto end;
1115
1116 if (!x2apic) {
1117 x2apic = 1;
1118 apic_ops = &x2apic_ops;
1119 enable_x2apic();
1120 }
1121end:
1122 if (ret)
1123 /*
1124 * IR enabling failed
1125 */
1126 restore_IO_APIC_setup();
1127 else
1128 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1129
1130 unmask_8259A();
1131 local_irq_restore(flags);
1132
1133 if (!ret) {
1134 if (!x2apic_preenabled)
1135 printk(KERN_INFO
1136 "Enabled x2apic and interrupt-remapping\n");
1137 else
1138 printk(KERN_INFO
1139 "Enabled Interrupt-remapping\n");
1140 } else
1141 printk(KERN_ERR
1142 "Failed to enable Interrupt-remapping and x2apic\n");
1143#else
1144 if (!cpu_has_x2apic)
1145 return;
1146
1147 if (x2apic_preenabled)
1148 panic("x2apic enabled prior OS handover,"
1149 " enable CONFIG_INTR_REMAP");
1150
1151 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1152 " and x2apic\n");
1153#endif
1154
1155 return;
1156}
1157
858/* 1158/*
859 * Detect and enable local APICs on non-SMP boards. 1159 * Detect and enable local APICs on non-SMP boards.
860 * Original code written by Keir Fraser. 1160 * Original code written by Keir Fraser.
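As background for check_x2apic()/enable_x2apic() above (hedged sketch; the helper below is illustrative and not part of the patch): once the x2APIC enable bit in IA32_APIC_BASE is set, every APIC register moves from the MMIO window to the MSR range starting at APIC_BASE_MSR (0x800), at offset reg >> 4 -- exactly the mapping used by native_apic_msr_read/write and the x2apic ICR accessors:

/* Illustrative only: reading the local APIC ID in x2APIC mode.
 * APIC_BASE_MSR (0x800) and APIC_ID (0x20) come from <asm/apicdef.h>. */
static u32 x2apic_read_id(void)
{
	u64 id;

	rdmsrl(APIC_BASE_MSR + (APIC_ID >> 4), id);	/* MSR 0x802 */
	return (u32)id;		/* full 32-bit ID, no GET_APIC_ID() shift */
}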
@@ -875,7 +1175,7 @@ static int __init detect_init_APIC(void)
875 1175
876void __init early_init_lapic_mapping(void) 1176void __init early_init_lapic_mapping(void)
877{ 1177{
878 unsigned long apic_phys; 1178 unsigned long phys_addr;
879 1179
880 /* 1180 /*
881 * If no local APIC can be found then go out 1181 * If no local APIC can be found then go out
@@ -884,17 +1184,17 @@ void __init early_init_lapic_mapping(void)
884 if (!smp_found_config) 1184 if (!smp_found_config)
885 return; 1185 return;
886 1186
887 apic_phys = mp_lapic_addr; 1187 phys_addr = mp_lapic_addr;
888 1188
889 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1189 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
890 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", 1190 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
891 APIC_BASE, apic_phys); 1191 APIC_BASE, phys_addr);
892 1192
893 /* 1193 /*
894 * Fetch the APIC ID of the BSP in case we have a 1194 * Fetch the APIC ID of the BSP in case we have a
895 * default configuration (or the MP table is broken). 1195 * default configuration (or the MP table is broken).
896 */ 1196 */
897 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1197 boot_cpu_physical_apicid = read_apic_id();
898} 1198}
899 1199
900/** 1200/**
@@ -902,6 +1202,11 @@ void __init early_init_lapic_mapping(void)
902 */ 1202 */
903void __init init_apic_mappings(void) 1203void __init init_apic_mappings(void)
904{ 1204{
1205 if (x2apic) {
1206 boot_cpu_physical_apicid = read_apic_id();
1207 return;
1208 }
1209
905 /* 1210 /*
906 * If no local APIC can be found then set up a fake all 1211 * If no local APIC can be found then set up a fake all
907 * zeroes page to simulate the local APIC and another 1212 * zeroes page to simulate the local APIC and another
@@ -921,13 +1226,15 @@ void __init init_apic_mappings(void)
921 * Fetch the APIC ID of the BSP in case we have a 1226 * Fetch the APIC ID of the BSP in case we have a
922 * default configuration (or the MP table is broken). 1227 * default configuration (or the MP table is broken).
923 */ 1228 */
924 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1229 boot_cpu_physical_apicid = read_apic_id();
925} 1230}
926 1231
927/* 1232/*
928 * This initializes the IO-APIC and APIC hardware if this is 1233 * This initializes the IO-APIC and APIC hardware if this is
929 * a UP kernel. 1234 * a UP kernel.
930 */ 1235 */
1236int apic_version[MAX_APICS];
1237
931int __init APIC_init_uniprocessor(void) 1238int __init APIC_init_uniprocessor(void)
932{ 1239{
933 if (disable_apic) { 1240 if (disable_apic) {
@@ -940,9 +1247,14 @@ int __init APIC_init_uniprocessor(void)
940 return -1; 1247 return -1;
941 } 1248 }
942 1249
1250 enable_IR_x2apic();
1251 setup_apic_routing();
1252
943 verify_local_APIC(); 1253 verify_local_APIC();
944 1254
945 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 1255 connect_bsp_APIC();
1256
1257 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
946 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); 1258 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
947 1259
948 setup_local_APIC(); 1260 setup_local_APIC();
@@ -954,6 +1266,8 @@ int __init APIC_init_uniprocessor(void)
954 if (!skip_ioapic_setup && nr_ioapics) 1266 if (!skip_ioapic_setup && nr_ioapics)
955 enable_IO_APIC(); 1267 enable_IO_APIC();
956 1268
1269 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1270 localise_nmi_watchdog();
957 end_local_APIC_setup(); 1271 end_local_APIC_setup();
958 1272
959 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1273 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1021,10 +1335,58 @@ asmlinkage void smp_error_interrupt(void)
1021 irq_exit(); 1335 irq_exit();
1022} 1336}
1023 1337
1338/**
1339 * connect_bsp_APIC - attach the APIC to the interrupt system
1340 */
1341void __init connect_bsp_APIC(void)
1342{
1343#ifdef CONFIG_X86_32
1344 if (pic_mode) {
1345 /*
1346 * Do not trust the local APIC being empty at bootup.
1347 */
1348 clear_local_APIC();
1349 /*
1350 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1351 * local APIC to INT and NMI lines.
1352 */
1353 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1354 "enabling APIC mode.\n");
1355 outb(0x70, 0x22);
1356 outb(0x01, 0x23);
1357 }
1358#endif
1359 enable_apic_mode();
1360}
1361
1362/**
1363 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1364 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1365 *
1366 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1367 * APIC is disabled.
1368 */
1024void disconnect_bsp_APIC(int virt_wire_setup) 1369void disconnect_bsp_APIC(int virt_wire_setup)
1025{ 1370{
1371 unsigned int value;
1372
1373#ifdef CONFIG_X86_32
1374 if (pic_mode) {
1375 /*
1376 * Put the board back into PIC mode (has an effect only on
1377 * certain older boards). Note that APIC interrupts, including
1378 * IPIs, won't work beyond this point! The only exception are
1379 * INIT IPIs.
1380 */
1381 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1382 "entering PIC mode.\n");
1383 outb(0x70, 0x22);
1384 outb(0x00, 0x23);
1385 return;
1386 }
1387#endif
1388
1026 /* Go back to Virtual Wire compatibility mode */ 1389 /* Go back to Virtual Wire compatibility mode */
1027 unsigned long value;
1028 1390
1029 /* For the spurious interrupt use vector F, and enable it */ 1391 /* For the spurious interrupt use vector F, and enable it */
1030 value = apic_read(APIC_SPIV); 1392 value = apic_read(APIC_SPIV);
@@ -1050,7 +1412,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1050 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1412 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1051 } 1413 }
1052 1414
1053 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1415 /*
1416 * For LVT1 make it edge triggered, active high,
1417 * nmi and enabled
1418 */
1054 value = apic_read(APIC_LVT1); 1419 value = apic_read(APIC_LVT1);
1055 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1420 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1056 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1421 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1065,15 +1430,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1065 int cpu; 1430 int cpu;
1066 cpumask_t tmp_map; 1431 cpumask_t tmp_map;
1067 1432
1068 if (num_processors >= NR_CPUS) { 1433 /*
1069 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1434 * Validate version
1070 " Processor ignored.\n", NR_CPUS); 1435 */
1071 return; 1436 if (version == 0x0) {
1437 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1438 "fixing up to 0x10. (tell your hw vendor)\n",
1439 version);
1440 version = 0x10;
1072 } 1441 }
1442 apic_version[apicid] = version;
1073 1443
1074 if (num_processors >= maxcpus) { 1444 if (num_processors >= NR_CPUS) {
1075 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." 1445 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1076 " Processor ignored.\n", maxcpus); 1446 " Processor ignored.\n", NR_CPUS);
1077 return; 1447 return;
1078 } 1448 }
1079 1449
@@ -1090,10 +1460,36 @@ void __cpuinit generic_processor_info(int apicid, int version)
1090 */ 1460 */
1091 cpu = 0; 1461 cpu = 0;
1092 } 1462 }
1463 if (apicid > max_physical_apicid)
1464 max_physical_apicid = apicid;
1465
1466#ifdef CONFIG_X86_32
1467 /*
1468 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1469 * but we need to work other dependencies like SMP_SUSPEND etc
1470 * before this can be done without some confusion.
1471 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1472 * - Ashok Raj <ashok.raj@intel.com>
1473 */
1474 if (max_physical_apicid >= 8) {
1475 switch (boot_cpu_data.x86_vendor) {
1476 case X86_VENDOR_INTEL:
1477 if (!APIC_XAPIC(version)) {
1478 def_to_bigsmp = 0;
1479 break;
1480 }
1481 /* If P4 and above fall through */
1482 case X86_VENDOR_AMD:
1483 def_to_bigsmp = 1;
1484 }
1485 }
1486#endif
1487
1488#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1093 /* are we being called early in kernel startup? */ 1489 /* are we being called early in kernel startup? */
1094 if (x86_cpu_to_apicid_early_ptr) { 1490 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1095 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1491 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1096 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1492 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1097 1493
1098 cpu_to_apicid[cpu] = apicid; 1494 cpu_to_apicid[cpu] = apicid;
1099 bios_cpu_apicid[cpu] = apicid; 1495 bios_cpu_apicid[cpu] = apicid;
@@ -1101,20 +1497,28 @@ void __cpuinit generic_processor_info(int apicid, int version)
1101 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1497 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1102 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1498 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1103 } 1499 }
1500#endif
1104 1501
1105 cpu_set(cpu, cpu_possible_map); 1502 cpu_set(cpu, cpu_possible_map);
1106 cpu_set(cpu, cpu_present_map); 1503 cpu_set(cpu, cpu_present_map);
1107} 1504}
1108 1505
1506int hard_smp_processor_id(void)
1507{
1508 return read_apic_id();
1509}
1510
1109/* 1511/*
1110 * Power management 1512 * Power management
1111 */ 1513 */
1112#ifdef CONFIG_PM 1514#ifdef CONFIG_PM
1113 1515
1114static struct { 1516static struct {
1115 /* 'active' is true if the local APIC was enabled by us and 1517 /*
1116 not the BIOS; this signifies that we are also responsible 1518 * 'active' is true if the local APIC was enabled by us and
1117 for disabling it before entering apm/acpi suspend */ 1519 * not the BIOS; this signifies that we are also responsible
1520 * for disabling it before entering apm/acpi suspend
1521 */
1118 int active; 1522 int active;
1119 /* r/w apic fields */ 1523 /* r/w apic fields */
1120 unsigned int apic_id; 1524 unsigned int apic_id;
@@ -1142,7 +1546,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1142 1546
1143 maxlvt = lapic_get_maxlvt(); 1547 maxlvt = lapic_get_maxlvt();
1144 1548
1145 apic_pm_state.apic_id = read_apic_id(); 1549 apic_pm_state.apic_id = apic_read(APIC_ID);
1146 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1550 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1147 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 1551 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1148 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 1552 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
@@ -1155,10 +1559,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1155 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1559 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1156 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1560 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1157 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1561 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1158#ifdef CONFIG_X86_MCE_INTEL 1562#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1159 if (maxlvt >= 5) 1563 if (maxlvt >= 5)
1160 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1564 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1161#endif 1565#endif
1566
1162 local_irq_save(flags); 1567 local_irq_save(flags);
1163 disable_local_APIC(); 1568 disable_local_APIC();
1164 local_irq_restore(flags); 1569 local_irq_restore(flags);
@@ -1177,10 +1582,25 @@ static int lapic_resume(struct sys_device *dev)
1177 maxlvt = lapic_get_maxlvt(); 1582 maxlvt = lapic_get_maxlvt();
1178 1583
1179 local_irq_save(flags); 1584 local_irq_save(flags);
1180 rdmsr(MSR_IA32_APICBASE, l, h); 1585
1181 l &= ~MSR_IA32_APICBASE_BASE; 1586#ifdef CONFIG_X86_64
1182 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1587 if (x2apic)
1183 wrmsr(MSR_IA32_APICBASE, l, h); 1588 enable_x2apic();
1589 else
1590#endif
1591 {
1592 /*
1593 * Make sure the APICBASE points to the right address
1594 *
1595 * FIXME! This will be wrong if we ever support suspend on
1596 * SMP! We'll need to do this as part of the CPU restore!
1597 */
1598 rdmsr(MSR_IA32_APICBASE, l, h);
1599 l &= ~MSR_IA32_APICBASE_BASE;
1600 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1601 wrmsr(MSR_IA32_APICBASE, l, h);
1602 }
1603
1184 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1604 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1185 apic_write(APIC_ID, apic_pm_state.apic_id); 1605 apic_write(APIC_ID, apic_pm_state.apic_id);
1186 apic_write(APIC_DFR, apic_pm_state.apic_dfr); 1606 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
@@ -1189,7 +1609,7 @@ static int lapic_resume(struct sys_device *dev)
1189 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1609 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1190 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1610 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1191 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1611 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1192#ifdef CONFIG_X86_MCE_INTEL 1612#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1193 if (maxlvt >= 5) 1613 if (maxlvt >= 5)
1194 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1614 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1195#endif 1615#endif
@@ -1203,10 +1623,17 @@ static int lapic_resume(struct sys_device *dev)
1203 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1623 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1204 apic_write(APIC_ESR, 0); 1624 apic_write(APIC_ESR, 0);
1205 apic_read(APIC_ESR); 1625 apic_read(APIC_ESR);
1626
1206 local_irq_restore(flags); 1627 local_irq_restore(flags);
1628
1207 return 0; 1629 return 0;
1208} 1630}
1209 1631
1632/*
1633 * This device has no shutdown method - fully functioning local APICs
1634 * are needed on every CPU up until machine_halt/restart/poweroff.
1635 */
1636
1210static struct sysdev_class lapic_sysclass = { 1637static struct sysdev_class lapic_sysclass = {
1211 .name = "lapic", 1638 .name = "lapic",
1212 .resume = lapic_resume, 1639 .resume = lapic_resume,
@@ -1269,7 +1696,7 @@ __cpuinit int apic_is_clustered_box(void)
1269 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) 1696 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1270 return 0; 1697 return 0;
1271 1698
1272 bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1699 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1273 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1700 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1274 1701
1275 for (i = 0; i < NR_CPUS; i++) { 1702 for (i = 0; i < NR_CPUS; i++) {
@@ -1320,42 +1747,30 @@ __cpuinit int apic_is_clustered_box(void)
1320 return (clusters > 2); 1747 return (clusters > 2);
1321} 1748}
1322 1749
1323/* 1750static __init int setup_nox2apic(char *str)
1324 * APIC command line parameters
1325 */
1326static int __init apic_set_verbosity(char *str)
1327{ 1751{
1328 if (str == NULL) { 1752 disable_x2apic = 1;
1329 skip_ioapic_setup = 0; 1753 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
1330 ioapic_force = 1;
1331 return 0;
1332 }
1333 if (strcmp("debug", str) == 0)
1334 apic_verbosity = APIC_DEBUG;
1335 else if (strcmp("verbose", str) == 0)
1336 apic_verbosity = APIC_VERBOSE;
1337 else {
1338 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1339 " use apic=verbose or apic=debug\n", str);
1340 return -EINVAL;
1341 }
1342
1343 return 0; 1754 return 0;
1344} 1755}
1345early_param("apic", apic_set_verbosity); 1756early_param("nox2apic", setup_nox2apic);
1757
1346 1758
1347static __init int setup_disableapic(char *str) 1759/*
1760 * APIC command line parameters
1761 */
1762static int __init setup_disableapic(char *arg)
1348{ 1763{
1349 disable_apic = 1; 1764 disable_apic = 1;
1350 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1765 setup_clear_cpu_cap(X86_FEATURE_APIC);
1351 return 0; 1766 return 0;
1352} 1767}
1353early_param("disableapic", setup_disableapic); 1768early_param("disableapic", setup_disableapic);
1354 1769
1355/* same as disableapic, for compatibility */ 1770/* same as disableapic, for compatibility */
1356static __init int setup_nolapic(char *str) 1771static int __init setup_nolapic(char *arg)
1357{ 1772{
1358 return setup_disableapic(str); 1773 return setup_disableapic(arg);
1359} 1774}
1360early_param("nolapic", setup_nolapic); 1775early_param("nolapic", setup_nolapic);
1361 1776
@@ -1366,14 +1781,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1366} 1781}
1367early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1782early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1368 1783
1369static __init int setup_noapictimer(char *str) 1784static int __init parse_disable_apic_timer(char *arg)
1370{ 1785{
1371 if (str[0] != ' ' && str[0] != 0)
1372 return 0;
1373 disable_apic_timer = 1; 1786 disable_apic_timer = 1;
1374 return 1; 1787 return 0;
1375} 1788}
1376__setup("noapictimer", setup_noapictimer); 1789early_param("noapictimer", parse_disable_apic_timer);
1790
1791static int __init parse_nolapic_timer(char *arg)
1792{
1793 disable_apic_timer = 1;
1794 return 0;
1795}
1796early_param("nolapic_timer", parse_nolapic_timer);
1377 1797
1378static __init int setup_apicpmtimer(char *s) 1798static __init int setup_apicpmtimer(char *s)
1379{ 1799{
@@ -1383,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s)
1383} 1803}
1384__setup("apicpmtimer", setup_apicpmtimer); 1804__setup("apicpmtimer", setup_apicpmtimer);
1385 1805
1806static int __init apic_set_verbosity(char *arg)
1807{
1808 if (!arg) {
1809#ifdef CONFIG_X86_64
1810 skip_ioapic_setup = 0;
1811 ioapic_force = 1;
1812 return 0;
1813#endif
1814 return -EINVAL;
1815 }
1816
1817 if (strcmp("debug", arg) == 0)
1818 apic_verbosity = APIC_DEBUG;
1819 else if (strcmp("verbose", arg) == 0)
1820 apic_verbosity = APIC_VERBOSE;
1821 else {
1822 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1823 " use apic=verbose or apic=debug\n", arg);
1824 return -EINVAL;
1825 }
1826
1827 return 0;
1828}
1829early_param("apic", apic_set_verbosity);
1830
1386static int __init lapic_insert_resource(void) 1831static int __init lapic_insert_resource(void)
1387{ 1832{
1388 if (!apic_phys) 1833 if (!apic_phys)
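
The non-x2apic branch added to lapic_resume() above only rewrites the APIC base MSR: it clears the base-address field and ORs in the enable bit plus the physical LAPIC address. A minimal standalone sketch of that bit manipulation, assuming the conventional mask values (base field 0xfffff000, enable bit 11) that the kernel takes from its MSR headers, and a typical 0xfee00000 LAPIC address:

#include <stdio.h>
#include <stdint.h>

/* Assumed values mirroring MSR_IA32_APICBASE_* in the kernel headers. */
#define APICBASE_BASE_MASK  0xfffff000u   /* physical base address field */
#define APICBASE_ENABLE     (1u << 11)    /* global APIC enable bit      */

int main(void)
{
	uint32_t l = 0x00000900u;             /* sample low word of the MSR  */
	uint32_t mp_lapic_addr = 0xfee00000u; /* typical LAPIC physical base */

	/* Same transformation as the non-x2apic branch of lapic_resume(). */
	l &= ~APICBASE_BASE_MASK;
	l |= APICBASE_ENABLE | mp_lapic_addr;

	printf("restored APICBASE low word: 0x%08x\n", l);
	return 0;
}
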
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,6 +204,7 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
207#include <linux/types.h> 208#include <linux/types.h>
208#include <linux/stddef.h> 209#include <linux/stddef.h>
209#include <linux/timer.h> 210#include <linux/timer.h>
@@ -218,7 +219,6 @@
218#include <linux/time.h> 219#include <linux/time.h>
219#include <linux/sched.h> 220#include <linux/sched.h>
220#include <linux/pm.h> 221#include <linux/pm.h>
221#include <linux/pm_legacy.h>
222#include <linux/capability.h> 222#include <linux/capability.h>
223#include <linux/device.h> 223#include <linux/device.h>
224#include <linux/kernel.h> 224#include <linux/kernel.h>
@@ -233,6 +233,7 @@
233#include <asm/uaccess.h> 233#include <asm/uaccess.h>
234#include <asm/desc.h> 234#include <asm/desc.h>
235#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
236#include <asm/paravirt.h> 237#include <asm/paravirt.h>
237#include <asm/reboot.h> 238#include <asm/reboot.h>
238 239
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1149 as->event_tail = 0; 1150 as->event_tail = 0;
1150 } 1151 }
1151 as->events[as->event_head] = event; 1152 as->events[as->event_head] = event;
1152 if ((!as->suser) || (!as->writer)) 1153 if (!as->suser || !as->writer)
1153 continue; 1154 continue;
1154 switch (event) { 1155 switch (event) {
1155 case APM_SYS_SUSPEND: 1156 case APM_SYS_SUSPEND:
@@ -1211,9 +1212,9 @@ static int suspend(int vetoable)
1211 if (err != APM_SUCCESS) 1212 if (err != APM_SUCCESS)
1212 apm_error("suspend", err); 1213 apm_error("suspend", err);
1213 err = (err == APM_SUCCESS) ? 0 : -EIO; 1214 err = (err == APM_SUCCESS) ? 0 : -EIO;
1214 device_power_up(); 1215 device_power_up(PMSG_RESUME);
1215 local_irq_enable(); 1216 local_irq_enable();
1216 device_resume(); 1217 device_resume(PMSG_RESUME);
1217 queue_event(APM_NORMAL_RESUME, NULL); 1218 queue_event(APM_NORMAL_RESUME, NULL);
1218 spin_lock(&user_list_lock); 1219 spin_lock(&user_list_lock);
1219 for (as = user_list; as != NULL; as = as->next) { 1220 for (as = user_list; as != NULL; as = as->next) {
@@ -1238,7 +1239,7 @@ static void standby(void)
1238 apm_error("standby", err); 1239 apm_error("standby", err);
1239 1240
1240 local_irq_disable(); 1241 local_irq_disable();
1241 device_power_up(); 1242 device_power_up(PMSG_RESUME);
1242 local_irq_enable(); 1243 local_irq_enable();
1243} 1244}
1244 1245
@@ -1324,7 +1325,7 @@ static void check_events(void)
1324 ignore_bounce = 1; 1325 ignore_bounce = 1;
1325 if ((event != APM_NORMAL_RESUME) 1326 if ((event != APM_NORMAL_RESUME)
1326 || (ignore_normal_resume == 0)) { 1327 || (ignore_normal_resume == 0)) {
1327 device_resume(); 1328 device_resume(PMSG_RESUME);
1328 queue_event(event, NULL); 1329 queue_event(event, NULL);
1329 } 1330 }
1330 ignore_normal_resume = 0; 1331 ignore_normal_resume = 0;
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void)
1396 1397
1397static int check_apm_user(struct apm_user *as, const char *func) 1398static int check_apm_user(struct apm_user *as, const char *func)
1398{ 1399{
1399 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { 1400 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1400 printk(KERN_ERR "apm: %s passed bad filp\n", func); 1401 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1401 return 1; 1402 return 1;
1402 } 1403 }
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
1459 return 0; 1460 return 0;
1460} 1461}
1461 1462
1462static int do_ioctl(struct inode *inode, struct file *filp, 1463static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1463 u_int cmd, u_long arg)
1464{ 1464{
1465 struct apm_user *as; 1465 struct apm_user *as;
1466 int ret;
1466 1467
1467 as = filp->private_data; 1468 as = filp->private_data;
1468 if (check_apm_user(as, "ioctl")) 1469 if (check_apm_user(as, "ioctl"))
1469 return -EIO; 1470 return -EIO;
1470 if ((!as->suser) || (!as->writer)) 1471 if (!as->suser || !as->writer)
1471 return -EPERM; 1472 return -EPERM;
1472 switch (cmd) { 1473 switch (cmd) {
1473 case APM_IOC_STANDBY: 1474 case APM_IOC_STANDBY:
1475 lock_kernel();
1474 if (as->standbys_read > 0) { 1476 if (as->standbys_read > 0) {
1475 as->standbys_read--; 1477 as->standbys_read--;
1476 as->standbys_pending--; 1478 as->standbys_pending--;
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
1479 queue_event(APM_USER_STANDBY, as); 1481 queue_event(APM_USER_STANDBY, as);
1480 if (standbys_pending <= 0) 1482 if (standbys_pending <= 0)
1481 standby(); 1483 standby();
1484 unlock_kernel();
1482 break; 1485 break;
1483 case APM_IOC_SUSPEND: 1486 case APM_IOC_SUSPEND:
1487 lock_kernel();
1484 if (as->suspends_read > 0) { 1488 if (as->suspends_read > 0) {
1485 as->suspends_read--; 1489 as->suspends_read--;
1486 as->suspends_pending--; 1490 as->suspends_pending--;
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
1488 } else 1492 } else
1489 queue_event(APM_USER_SUSPEND, as); 1493 queue_event(APM_USER_SUSPEND, as);
1490 if (suspends_pending <= 0) { 1494 if (suspends_pending <= 0) {
1491 return suspend(1); 1495 ret = suspend(1);
1492 } else { 1496 } else {
1493 as->suspend_wait = 1; 1497 as->suspend_wait = 1;
1494 wait_event_interruptible(apm_suspend_waitqueue, 1498 wait_event_interruptible(apm_suspend_waitqueue,
1495 as->suspend_wait == 0); 1499 as->suspend_wait == 0);
1496 return as->suspend_result; 1500 ret = as->suspend_result;
1497 } 1501 }
1498 break; 1502 unlock_kernel();
1503 return ret;
1499 default: 1504 default:
1500 return -EINVAL; 1505 return -ENOTTY;
1501 } 1506 }
1502 return 0; 1507 return 0;
1503} 1508}
@@ -1544,10 +1549,12 @@ static int do_open(struct inode *inode, struct file *filp)
1544{ 1549{
1545 struct apm_user *as; 1550 struct apm_user *as;
1546 1551
1552 lock_kernel();
1547 as = kmalloc(sizeof(*as), GFP_KERNEL); 1553 as = kmalloc(sizeof(*as), GFP_KERNEL);
1548 if (as == NULL) { 1554 if (as == NULL) {
1549 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1555 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1550 sizeof(*as)); 1556 sizeof(*as));
1557 unlock_kernel();
1551 return -ENOMEM; 1558 return -ENOMEM;
1552 } 1559 }
1553 as->magic = APM_BIOS_MAGIC; 1560 as->magic = APM_BIOS_MAGIC;
@@ -1569,6 +1576,7 @@ static int do_open(struct inode *inode, struct file *filp)
1569 user_list = as; 1576 user_list = as;
1570 spin_unlock(&user_list_lock); 1577 spin_unlock(&user_list_lock);
1571 filp->private_data = as; 1578 filp->private_data = as;
1579 unlock_kernel();
1572 return 0; 1580 return 0;
1573} 1581}
1574 1582
@@ -1860,7 +1868,7 @@ static const struct file_operations apm_bios_fops = {
1860 .owner = THIS_MODULE, 1868 .owner = THIS_MODULE,
1861 .read = do_read, 1869 .read = do_read,
1862 .poll = do_poll, 1870 .poll = do_poll,
1863 .ioctl = do_ioctl, 1871 .unlocked_ioctl = do_ioctl,
1864 .open = do_open, 1872 .open = do_open,
1865 .release = do_release, 1873 .release = do_release,
1866}; 1874};
@@ -2209,7 +2217,7 @@ static int __init apm_init(void)
2209 2217
2210 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2211 2219
2212 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2213 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2214 return -ENODEV; 2222 return -ENODEV;
2215 } 2223 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 92588083950f..6649d09ad88f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,7 +111,7 @@ void foo(void)
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); 114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); 115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif 116#endif
117 117
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f126c05d6170..505543a75a56 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,9 +18,11 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef ASM_X86__UNISTD_64_H
24#define __SYSCALL(nr, sym) [nr] = 1, 26#define __SYSCALL(nr, sym) [nr] = 1,
25static char syscalls[] = { 27static char syscalls[] = {
26#include <asm/unistd.h> 28#include <asm/unistd.h>
@@ -34,7 +36,7 @@ int main(void)
34 ENTRY(pid); 36 ENTRY(pid);
35 BLANK(); 37 BLANK();
36#undef ENTRY 38#undef ENTRY
37#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) 39#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
38 ENTRY(flags); 40 ENTRY(flags);
39 ENTRY(addr_limit); 41 ENTRY(addr_limit);
40 ENTRY(preempt_count); 42 ENTRY(preempt_count);
@@ -61,8 +63,11 @@ int main(void)
61 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); 63 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
62 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 64 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
63 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 65 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
66 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
64 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 67 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
65 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); 68 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
69 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
70 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
66 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 71 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
67 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 72 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
68#endif 73#endif
@@ -128,5 +133,14 @@ int main(void)
128 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
129 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
130 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
131 return 0; 145 return 0;
132} 146}
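
The OFFSET()/DEFINE() entries above (now emitting TI_* names) are not ordinary runtime code: the file is compiled to assembly at build time and the markers are scraped into asm-offsets.h so entry code can address struct fields as constants. A hedged standalone sketch of the idea, printing the offsets directly instead of going through the assembler; the struct layout here is invented purely for illustration:

#include <stdio.h>
#include <stddef.h>

/* Toy stand-in for struct thread_info; the real layout lives in kernel headers. */
struct thread_info {
	unsigned long flags;
	unsigned long addr_limit;
	int           preempt_count;
};

/*
 * The kernel's DEFINE() emits "->TI_flags <value>" markers into the
 * generated assembly, which kbuild greps into asm-offsets.h.
 * Here we simply print the same constants.
 */
#define SHOW(sym, str, mem) \
	printf(#sym " = %zu\n", offsetof(struct str, mem))

int main(void)
{
	SHOW(TI_flags,         thread_info, flags);
	SHOW(TI_addr_limit,    thread_info, addr_limit);
	SHOW(TI_preempt_count, thread_info, preempt_count);
	return 0;
}
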
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..fdd585f9c53d
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
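
x86_bios_strerror() above is a pure status-to-string mapping, so it can be exercised outside the kernel as-is; a small standalone harness (the main() wrapper is only for illustration):

#include <stdio.h>

/* Adapted from bios_uv.c above. */
static const char *x86_bios_strerror(long status)
{
	const char *str;
	switch (status) {
	case  0: str = "Call completed without error"; break;
	case -1: str = "Not implemented"; break;
	case -2: str = "Invalid argument"; break;
	case -3: str = "Call completed with error"; break;
	default: str = "Unknown BIOS status code"; break;
	}
	return str;
}

int main(void)
{
	long status;

	for (status = -4; status <= 1; status++)
		printf("%2ld: %s\n", status, x86_bios_strerror(status));
	return 0;
}
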
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f8190887..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,18 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_32) += amd.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += cyrix.o 10
11obj-$(CONFIG_X86_32) += centaur.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += transmeta.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += intel.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_32) += umc.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
15 18
16obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
17obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
18obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
19 22
20obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
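
The new rule derives capflags.c from cpufeature.h via mkcapflags.pl, replacing the hand-maintained feature_names.c. The generated output is not shown in this diff; as a rough, assumed sketch, it is a bit-indexed table of flag names that the /proc/cpuinfo code can walk, along these lines:

#include <stdio.h>

/*
 * Assumed shape of the generated table: one string per feature bit,
 * indexed by the X86_FEATURE_* bit number. Only a few sample entries
 * are shown; the real file covers all NCAPINTS*32 bits.
 */
static const char * const x86_cap_flags[] = {
	[0] = "fpu",
	[3] = "pse",
	[4] = "tsc",
	[6] = "pae",
};

int main(void)
{
	unsigned int caps = (1u << 0) | (1u << 4) | (1u << 6); /* sample capability word */
	unsigned int bit;

	for (bit = 0; bit < sizeof(x86_cap_flags) / sizeof(x86_cap_flags[0]); bit++)
		if ((caps & (1u << bit)) && x86_cap_flags[bit])
			printf("%s ", x86_cap_flags[bit]);
	printf("\n");
	return 0;
}
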
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7cb..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,14 +1,14 @@
1
2/* 1/*
 3 * Routines to identify additional cpu features that are scattered in 2 * Routines to identify additional cpu features that are scattered in
4 * cpuid space. 3 * cpuid space.
5 */ 4 */
6
7#include <linux/cpu.h> 5#include <linux/cpu.h>
8 6
9#include <asm/pat.h> 7#include <asm/pat.h>
10#include <asm/processor.h> 8#include <asm/processor.h>
11 9
10#include <mach_apic.h>
11
12struct cpuid_bit { 12struct cpuid_bit {
13 u16 feature; 13 u16 feature;
14 u8 reg; 14 u8 reg;
@@ -50,22 +50,122 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
50 } 50 }
51} 51}
52 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
53#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
54void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
55{ 141{
142 if (!cpu_has_pat)
143 pat_disable("PAT not supported by CPU.");
144
56 switch (c->x86_vendor) { 145 switch (c->x86_vendor) {
57 case X86_VENDOR_AMD:
58 if (c->x86 >= 0xf && c->x86 <= 0x11)
59 return;
60 break;
61 case X86_VENDOR_INTEL: 146 case X86_VENDOR_INTEL:
62 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 147 /*
148 * There is a known erratum on Pentium III and Core Solo
149 * and Core Duo CPUs.
150 * " Page with PAT set to WC while associated MTRR is UC
151 * may consolidate to UC "
152 * Because of this erratum, it is better to stick with
153 * setting WC in MTRR rather than using PAT on these CPUs.
154 *
155 * Enable PAT WC only on P4, Core 2 or later CPUs.
156 */
157 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
63 return; 158 return;
64 break; 159
160 pat_disable("PAT WC disabled due to known CPU erratum.");
161 return;
162
163 case X86_VENDOR_AMD:
164 case X86_VENDOR_CENTAUR:
165 case X86_VENDOR_TRANSMETA:
166 return;
65 } 167 }
66 168
67 pat_disable(cpu_has_pat ? 169 pat_disable("PAT disabled. Not yet verified on this CPU type.");
68 "PAT disabled. Not yet verified on this CPU type." :
69 "PAT not supported by CPU.");
70} 170}
71#endif 171#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 245866828294..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,60 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27#ifdef CONFIG_X86_LOCAL_APIC 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28#define ENABLE_C1E_MASK 0x18000000
29#define CPUID_PROCESSOR_SIGNATURE 1
30#define CPUID_XFAM 0x0ff00000
31#define CPUID_XFAM_K8 0x00000000
32#define CPUID_XFAM_10H 0x00100000
33#define CPUID_XFAM_11H 0x00200000
34#define CPUID_XMOD 0x000f0000
35#define CPUID_XMOD_REV_F 0x00040000
36
37/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
38static __cpuinit int amd_apic_timer_broken(void)
39{ 37{
40 u32 lo, hi; 38/*
41 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 39 * General Systems BIOSen alias the cpu frequency registers
 42 switch (eax & CPUID_XFAM) { 40 * of the Elan at 0x000df000. Unfortunately, one of the Linux
43 case CPUID_XFAM_K8: 41 * drivers subsequently pokes it, and changes the CPU speed.
44 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) 42 * Workaround : Remove the unneeded alias.
45 break; 43 */
46 case CPUID_XFAM_10H: 44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
47 case CPUID_XFAM_11H: 45#define CBAR_ENB (0x80000000)
48 rdmsr(MSR_K8_ENABLE_C1E, lo, hi); 46#define CBAR_KEY (0X000000CB)
49 if (lo & ENABLE_C1E_MASK) { 47 if (c->x86_model == 9 || c->x86_model == 10) {
50 if (smp_processor_id() != boot_cpu_physical_apicid) 48 if (inl (CBAR) & CBAR_ENB)
51 printk(KERN_INFO "AMD C1E detected late. " 49 outl (0 | CBAR_KEY, CBAR);
52 " Force timer broadcast.\n"); 50 }
53 return 1; 51}
52
53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
55{
56 u32 l, h;
57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
54 } 64 }
55 break; 65 return;
56 default: 66 }
57 /* err on the side of caution */ 67
58 return 1; 68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
59 } 143 }
60 return 0; 144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
61} 180}
62#endif 181#endif
63 182
64int force_mwait __cpuinitdata; 183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
65 187
66static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
 203 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
67{ 221{
68 if (cpuid_eax(0x80000000) >= 0x80000007) { 222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
69 c->x86_power = cpuid_edx(0x80000007); 223 int cpu = smp_processor_id();
70 if (c->x86_power & (1<<8)) 224 int node;
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
72 } 249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
73} 254}
74 255
75static void __cpuinit init_amd(struct cpuinfo_x86 *c) 256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
76{ 257{
77 u32 l, h; 258#ifdef CONFIG_X86_HT
78 int mbytes = num_physpages >> (20-PAGE_SHIFT); 259 unsigned bits, ecx;
79 int r; 260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
80 300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
81#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
82 unsigned long long value; 304 unsigned long long value;
83 305
@@ -88,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
88 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
89 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
90 */ 312 */
91 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
92 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
93 value |= 1 << 6; 315 value |= 1 << 6;
94 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -98,218 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
98 early_init_amd(c); 320 early_init_amd(c);
99 321
100 /* 322 /*
101 * FIXME: We should handle the K5 here. Set up the write
102 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
103 * no bus pipeline)
104 */
105
106 /*
107 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
108 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
109 */ 325 */
110 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
111 327
112 r = get_model_name(c); 328#ifdef CONFIG_X86_64
113 329 /* On C+ stepping K8 rep microcode works well for copy/memset */
114 switch (c->x86) { 330 if (c->x86 == 0xf) {
115 case 4: 331 u32 level;
116 /*
117 * General Systems BIOSen alias the cpu frequency registers
 118 * of the Elan at 0x000df000. Unfortunately, one of the Linux
119 * drivers subsequently pokes it, and changes the CPU speed.
120 * Workaround : Remove the unneeded alias.
121 */
122#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
123#define CBAR_ENB (0x80000000)
124#define CBAR_KEY (0X000000CB)
125 if (c->x86_model == 9 || c->x86_model == 10) {
126 if (inl (CBAR) & CBAR_ENB)
127 outl (0 | CBAR_KEY, CBAR);
128 }
129 break;
130 case 5:
131 if (c->x86_model < 6) {
132 /* Based on AMD doc 20734R - June 2000 */
133 if (c->x86_model == 0) {
134 clear_cpu_cap(c, X86_FEATURE_APIC);
135 set_cpu_cap(c, X86_FEATURE_PGE);
136 }
137 break;
138 }
139
140 if (c->x86_model == 6 && c->x86_mask == 1) {
141 const int K6_BUG_LOOP = 1000000;
142 int n;
143 void (*f_vide)(void);
144 unsigned long d, d2;
145
146 printk(KERN_INFO "AMD K6 stepping B detected - ");
147
148 /*
149 * It looks like AMD fixed the 2.6.2 bug and improved indirect
150 * calls at the same time.
151 */
152
153 n = K6_BUG_LOOP;
154 f_vide = vide;
155 rdtscl(d);
156 while (n--)
157 f_vide();
158 rdtscl(d2);
159 d = d2-d;
160
161 if (d > 20*K6_BUG_LOOP)
162 printk("system stability may be impaired when more than 32 MB are used.\n");
163 else
164 printk("probably OK (after B9730xxxx).\n");
165 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
166 }
167
168 /* K6 with old style WHCR */
169 if (c->x86_model < 8 ||
170 (c->x86_model == 8 && c->x86_mask < 8)) {
171 /* We can only write allocate on the low 508Mb */
172 if (mbytes > 508)
173 mbytes = 508;
174
175 rdmsr(MSR_K6_WHCR, l, h);
176 if ((l&0x0000FFFF) == 0) {
177 unsigned long flags;
178 l = (1<<0)|((mbytes/4)<<1);
179 local_irq_save(flags);
180 wbinvd();
181 wrmsr(MSR_K6_WHCR, l, h);
182 local_irq_restore(flags);
183 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
184 mbytes);
185 }
186 break;
187 }
188
189 if ((c->x86_model == 8 && c->x86_mask > 7) ||
190 c->x86_model == 9 || c->x86_model == 13) {
191 /* The more serious chips .. */
192
193 if (mbytes > 4092)
194 mbytes = 4092;
195
196 rdmsr(MSR_K6_WHCR, l, h);
197 if ((l&0xFFFF0000) == 0) {
198 unsigned long flags;
199 l = ((mbytes>>2)<<22)|(1<<16);
200 local_irq_save(flags);
201 wbinvd();
202 wrmsr(MSR_K6_WHCR, l, h);
203 local_irq_restore(flags);
204 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
205 mbytes);
206 }
207
208 /* Set MTRR capability flag if appropriate */
209 if (c->x86_model == 13 || c->x86_model == 9 ||
210 (c->x86_model == 8 && c->x86_mask >= 8))
211 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
212 break;
213 }
214
215 if (c->x86_model == 10) {
216 /* AMD Geode LX is model 10 */
217 /* placeholder for any needed mods */
218 break;
219 }
220 break;
221 case 6: /* An Athlon/Duron */
222 332
223 /* 333 level = cpuid_eax(1);
224 * Bit 15 of Athlon specific MSR 15, needs to be 0 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
225 * to enable SSE on Palomino/Morgan/Barton CPU's. 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
226 * If the BIOS didn't enable it already, enable it here.
227 */
228 if (c->x86_model >= 6 && c->x86_model <= 10) {
229 if (!cpu_has(c, X86_FEATURE_XMM)) {
230 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
231 rdmsr(MSR_K7_HWCR, l, h);
232 l &= ~0x00008000;
233 wrmsr(MSR_K7_HWCR, l, h);
234 set_cpu_cap(c, X86_FEATURE_XMM);
235 }
236 }
237
238 /*
239 * It's been determined by AMD that Athlons since model 8 stepping 1
240 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
241 * As per AMD technical note 27212 0.2
242 */
243 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
244 rdmsr(MSR_K7_CLK_CTL, l, h);
245 if ((l & 0xfff00000) != 0x20000000) {
246 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
247 ((l & 0x000fffff)|0x20000000));
248 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
249 }
250 }
251 break;
252 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
253 346
254 switch (c->x86) { 347 switch (c->x86) {
255 case 15: 348 case 4:
256 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
257 case 0x10:
258 case 0x11:
259 set_cpu_cap(c, X86_FEATURE_K8);
260 break; 350 break;
261 case 6: 351 case 5:
262 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
263 break; 356 break;
264 } 357 }
358
 359 /* K6s report MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
265 if (c->x86 >= 6) 365 if (c->x86 >= 6)
266 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
267 367
268 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
269 369 switch (c->x86) {
270 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
271 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
272 377
273#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
274 /*
 275 * On an AMD multi core setup the lower bits of the APIC id
276 * distinguish the cores.
277 */
278 if (c->x86_max_cores > 1) {
279 int cpu = smp_processor_id();
280 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
281 379
282 if (bits == 0) { 380 /* Multi core CPU? */
283 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
284 bits++; 382 amd_detect_cmp(c);
285 } 383 srat_detect_node(c);
286 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
287 c->phys_proc_id >>= bits;
288 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
289 cpu, c->x86_max_cores, c->cpu_core_id);
290 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
291#endif 388#endif
292 389
293 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
294 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
295 num_cache_leaves = 4; 392 num_cache_leaves = 4;
296 else 393 else
297 num_cache_leaves = 3; 394 num_cache_leaves = 3;
298 } 395 }
299 396
300#ifdef CONFIG_X86_LOCAL_APIC 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
301 if (amd_apic_timer_broken()) 398 set_cpu_cap(c, X86_FEATURE_K8);
302 local_apic_timer_disabled = 1;
303#endif
304
 305 /* K6s report MCEs but don't actually have all the MSRs */
306 if (c->x86 < 6)
307 clear_cpu_cap(c, X86_FEATURE_MCE);
308 399
309 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
310 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
311} 433}
312 434
435#ifdef CONFIG_X86_32
313static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
314{ 437{
315 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -322,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
322 } 445 }
323 return size; 446 return size;
324} 447}
448#endif
325 449
326static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
327 .c_vendor = "AMD", 451 .c_vendor = "AMD",
328 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
329 .c_models = { 454 .c_models = {
330 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
331 { 456 {
@@ -338,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
338 } 463 }
339 }, 464 },
340 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
341 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
342 .c_init = init_amd, 469 .c_init = init_amd,
343 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
344}; 471};
345 472
346cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
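
amd_detect_cmp() above splits the initial APIC ID into a core index (the low x86_coreid_bits bits) and a socket ID (the remaining high bits). A standalone illustration of that split with made-up sample values:

#include <stdio.h>

int main(void)
{
	unsigned int initial_apicid = 0x0b;  /* sample APIC ID                     */
	unsigned int coreid_bits    = 2;     /* as reported via CPUID 0x80000008   */
	unsigned int cpu_core_id, phys_proc_id;

	/* Low order bits select the core within the socket... */
	cpu_core_id  = initial_apicid & ((1u << coreid_bits) - 1);
	/* ...and the remaining bits of the APIC ID give the socket number. */
	phys_proc_id = initial_apicid >> coreid_bits;

	printf("APIC ID 0x%02x -> socket %u, core %u\n",
	       initial_apicid, phys_proc_id, cpu_core_id);
	return 0;
}
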
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b2..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -59,8 +61,12 @@ static void __init check_fpu(void)
59 return; 61 return;
60 } 62 }
61 63
62/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ 64 /*
63 /* Test for the divl bug.. */ 65 * trap_init() enabled FXSR and company _before_ testing for FP
66 * problems here.
67 *
68 * Test for the divl bug..
69 */
64 __asm__("fninit\n\t" 70 __asm__("fninit\n\t"
65 "fldl %1\n\t" 71 "fldl %1\n\t"
66 "fdivl %2\n\t" 72 "fdivl %2\n\t"
@@ -70,8 +76,10 @@ static void __init check_fpu(void)
70 "fistpl %0\n\t" 76 "fistpl %0\n\t"
71 "fwait\n\t" 77 "fwait\n\t"
72 "fninit" 78 "fninit"
73 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
74 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
75 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
76 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
77} 85}
@@ -108,10 +116,15 @@ static void __init check_popad(void)
108 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " 116 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
109 : "=&a" (res) 117 : "=&a" (res)
110 : "d" (inp) 118 : "d" (inp)
111 : "ecx", "edi" ); 119 : "ecx", "edi");
112 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ 120 /*
113 if (res != 12345678) printk( "Buggy.\n" ); 121 * If this fails, it means that any user program may lock the
114 else printk( "OK.\n" ); 122 * CPU hard. Too bad.
123 */
124 if (res != 12345678)
125 printk("Buggy.\n");
126 else
127 printk("OK.\n");
115#endif 128#endif
116} 129}
117 130
@@ -122,13 +135,7 @@ static void __init check_popad(void)
122 * (for due to lack of "invlpg" and working WP on a i386) 135 * (for due to lack of "invlpg" and working WP on a i386)
123 * - In order to run on anything without a TSC, we need to be 136 * - In order to run on anything without a TSC, we need to be
124 * compiled for a i486. 137 * compiled for a i486.
125 * - In order to support the local APIC on a buggy Pentium machine, 138 */
126 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
127 * which happens implicitly if compiled for a Pentium or lower
128 * (unless an advanced selection of CPU features is used) as an
129 * otherwise config implies a properly working local APIC without
130 * the need to do extra reads from the APIC.
131*/
132 139
133static void __init check_config(void) 140static void __init check_config(void)
134{ 141{
@@ -137,25 +144,11 @@ static void __init check_config(void)
137 * i486+ only features! (WP works in supervisor mode and the 144 * i486+ only features! (WP works in supervisor mode and the
138 * new "invlpg" and "bswap" instructions) 145 * new "invlpg" and "bswap" instructions)
139 */ 146 */
140#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) 147#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
148 defined(CONFIG_X86_BSWAP)
141 if (boot_cpu_data.x86 == 3) 149 if (boot_cpu_data.x86 == 3)
142 panic("Kernel requires i486+ for 'invlpg' and other features"); 150 panic("Kernel requires i486+ for 'invlpg' and other features");
143#endif 151#endif
144
145/*
146 * If we were told we had a good local APIC, check for buggy Pentia,
147 * i.e. all B steppings and the C2 stepping of P54C when using their
148 * integrated APIC (see 11AP erratum in "Pentium Processor
149 * Specification Update").
150 */
151#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
152 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
153 && cpu_has_apic
154 && boot_cpu_data.x86 == 5
155 && boot_cpu_data.x86_model == 2
156 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
157 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
158#endif
159} 152}
160 153
161 154
@@ -170,6 +163,7 @@ void __init check_bugs(void)
170 check_fpu(); 163 check_fpu();
171 check_hlt(); 164 check_hlt();
172 check_popad(); 165 check_popad();
173 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); 166 init_utsname()->machine[1] =
167 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
174 alternative_instructions(); 168 alternative_instructions();
175} 169}
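
The rewritten check_fpu() still runs the classic Pentium FDIV probe, only storing the result in a local before copying it into boot_cpu_data. A plain-C rendition of the same arithmetic, assuming x is the usual 4195835.0 companion to the y = 3145727.0 visible in the hunk; the original Pentium FDIV bug famously leaves a residue of about 256 here:

#include <stdio.h>

int main(void)
{
	/* The classic FDIV test pair; y matches the value visible in the hunk. */
	volatile double x = 4195835.0;
	volatile double y = 3145727.0;
	double residue;

	/* On a correct FPU the quotient round-trips and the residue is 0. */
	residue = x - (x / y) * y;

	printf("FDIV residue: %f\n", residue);  /* ~256 on a buggy Pentium */
	return 0;
}
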
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..9a3ed0649d4e 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -314,6 +313,16 @@ enum {
314 EAMD3D = 1<<20, 313 EAMD3D = 1<<20,
315}; 314};
316 315
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{
318 switch (c->x86) {
319 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break;
323 }
324}
325
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 326static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 327{
319 328
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 471static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 472 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 473 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 475 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
467}; 478};
468 479
469cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 000000000000..a1625f5a1e78
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,37 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
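
The fallbacks above give pre-486 CPUs a cmpxchg with the usual semantics: return the previous value, and store the new one only if the old value still matched. A user-space sketch of the standard retry loop built on those semantics, using the GCC __sync builtin in place of the kernel's cmpxchg() wrapper:

#include <stdio.h>

/* Atomically add 'delta' to *p with compare-and-swap, returning the old value. */
static unsigned long atomic_add_cas(unsigned long *p, unsigned long delta)
{
	unsigned long old, new;

	do {
		old = *p;
		new = old + delta;
		/* The CAS succeeds (and stores 'new') only if *p is still 'old'. */
	} while (__sync_val_compare_and_swap(p, old, new) != old);

	return old;
}

int main(void)
{
	unsigned long counter = 40;

	atomic_add_cas(&counter, 2);
	printf("counter = %lu\n", counter);
	return 0;
}
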
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d0463a946247..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,27 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
16#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 24#include <asm/mpspec.h>
18#include <asm/apic.h> 25#include <asm/apic.h>
19#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
20#endif 28#endif
21 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
22#include "cpu.h" 39#include "cpu.h"
23 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
57} }; 92} };
93#endif
58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
59 95
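Each initializer pair above packs the low and high 32-bit words of an 8-byte segment descriptor. Decoding them by hand shows why the 64-bit table needs its own entries: 0x00cf9a00 sets G=1 and D/B=1 for a flat 32-bit code segment, while 0x00af9b00 sets G=1 and L=1 for long-mode code. A small sketch of that decode (standalone, standard descriptor layout assumed):

    /* Sketch: decode one packed GDT entry as used above (low, high 32-bit words). */
    #include <stdio.h>
    #include <stdint.h>

    static void decode_gdt_entry(uint32_t lo, uint32_t hi)
    {
            uint32_t base  = (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xff000000);
            uint32_t limit = (lo & 0xffff) | (hi & 0x000f0000);
            uint8_t  type  = (hi >> 8) & 0xff;      /* access byte: P, DPL, S, type */
            uint8_t  flags = (hi >> 20) & 0xf;      /* G, D/B, L, AVL */

            printf("base=%#x limit=%#x access=%#x G=%u D=%u L=%u\n",
                   base, limit, type, (flags >> 3) & 1, (flags >> 2) & 1, (flags >> 1) & 1);
    }

    int main(void)
    {
            decode_gdt_entry(0x0000ffff, 0x00cf9a00);       /* 32-bit flat kernel code */
            decode_gdt_entry(0x0000ffff, 0x00af9b00);       /* 64-bit kernel code (L=1) */
            return 0;
    }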
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
61
62static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
63static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
64 99
65struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 /*
128 * Cyrix and IDT cpus allow disabling of CPUID
129 * so the code below may return different results
130 * when it is executed before and after enabling
131 * the CPUID. Add "volatile" to not allow gcc to
132 * optimize the subsequent calls to this function.
133 */
134 asm volatile ("pushfl\n\t"
135 "pushfl\n\t"
136 "popl %0\n\t"
137 "movl %0,%1\n\t"
138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
146
147 return ((f1^f2) & flag) != 0;
148}
149
150/* Probe for the CPUID instruction */
151static int __cpuinit have_cpuid_p(void)
152{
153 return flag_is_changeable_p(X86_EFLAGS_ID);
154}
155
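flag_is_changeable_p() toggles EFLAGS bit 21 (the ID flag); if the change sticks, the CPUID instruction exists. The same trick with the AC flag later distinguishes a 386 from a 486 in identify_cpu_without_cpuid(). From user space the equivalent probe is available through GCC's <cpuid.h>, whose __get_cpuid_max() returns 0 when CPUID is unsupported (a sketch assuming a GCC or clang x86 toolchain):

    /* Sketch: probing for CPUID from user space; on 32-bit targets the helper
     * performs the same EFLAGS.ID toggle shown above. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int max_leaf = __get_cpuid_max(0, NULL);

            if (!max_leaf) {
                    puts("CPUID not available (pre-486 class CPU)");
                    return 1;
            }
            printf("CPUID available, max standard leaf = %#x\n", max_leaf);
            return 0;
    }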
156static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
157{
158 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
159 /* Disable processor serial number */
160 unsigned long lo, hi;
161 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
162 lo |= 0x200000;
163 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
164 printk(KERN_NOTICE "CPU serial number disabled.\n");
165 clear_cpu_cap(c, X86_FEATURE_PN);
166
167 /* Disabling the serial number may affect the cpuid level */
168 c->cpuid_level = cpuid_eax(0);
169 }
170}
171
172static int __init x86_serial_nr_setup(char *s)
173{
174 disable_x86_serial_nr = 0;
175 return 1;
176}
177__setup("serialnumber", x86_serial_nr_setup);
178#else
179static inline int flag_is_changeable_p(u32 flag)
180{
181 return 1;
182}
183/* Probe for the CPUID instruction */
184static inline int have_cpuid_p(void)
185{
186 return 1;
187}
188static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
189{
190}
191#endif
192
193/*
194 * Naming convention should be: <Name> [(<Codename>)]
195 * This table only is used unless init_<vendor>() below doesn't set it;
196 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
197 *
198 */
199
200/* Look up CPU names by table lookup. */
201static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
202{
203 struct cpu_model_info *info;
204
205 if (c->x86_model >= 16)
206 return NULL; /* Range check */
207
208 if (!this_cpu)
209 return NULL;
210
211 info = this_cpu->c_models;
212
213 while (info && info->family) {
214 if (info->family == c->x86)
215 return info->model_names[c->x86_model];
216 info++;
217 }
218 return NULL; /* Not found */
219}
220
221__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
222
223/* Current gdt points %fs at the "master" per-cpu area: after this,
224 * it's on the real one. */
225void switch_to_new_gdt(void)
226{
227 struct desc_ptr gdt_descr;
228
229 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
230 gdt_descr.size = GDT_SIZE - 1;
231 load_gdt(&gdt_descr);
232#ifdef CONFIG_X86_32
233 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
234#endif
235}
236
237static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
66 238
67static void __cpuinit default_init(struct cpuinfo_x86 *c) 239static void __cpuinit default_init(struct cpuinfo_x86 *c)
68{ 240{
241#ifdef CONFIG_X86_64
242 display_cacheinfo(c);
243#else
69 /* Not much we can do here... */ 244 /* Not much we can do here... */
70 /* Check if at least it has cpuid */ 245 /* Check if at least it has cpuid */
71 if (c->cpuid_level == -1) { 246 if (c->cpuid_level == -1) {
@@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
75 else if (c->x86 == 3) 250 else if (c->x86 == 3)
76 strcpy(c->x86_model_id, "386"); 251 strcpy(c->x86_model_id, "386");
77 } 252 }
253#endif
78} 254}
79 255
80static struct cpu_dev __cpuinitdata default_cpu = { 256static struct cpu_dev __cpuinitdata default_cpu = {
81 .c_init = default_init, 257 .c_init = default_init,
82 .c_vendor = "Unknown", 258 .c_vendor = "Unknown",
259 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83}; 260};
84static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
85
86static int __init cachesize_setup(char *str)
87{
88 get_option(&str, &cachesize_override);
89 return 1;
90}
91__setup("cachesize=", cachesize_setup);
92 261
93int __cpuinit get_model_name(struct cpuinfo_x86 *c) 262static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
94{ 263{
95 unsigned int *v; 264 unsigned int *v;
96 char *p, *q; 265 char *p, *q;
97 266
98 if (cpuid_eax(0x80000000) < 0x80000004) 267 if (c->extended_cpuid_level < 0x80000004)
99 return 0; 268 return;
100 269
101 v = (unsigned int *) c->x86_model_id; 270 v = (unsigned int *) c->x86_model_id;
102 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 271 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
115 while (q <= &c->x86_model_id[48]) 284 while (q <= &c->x86_model_id[48])
116 *q++ = '\0'; /* Zero-pad the rest */ 285 *q++ = '\0'; /* Zero-pad the rest */
117 } 286 }
118
119 return 1;
120} 287}
121 288
122
123void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 289void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
124{ 290{
125 unsigned int n, dummy, ecx, edx, l2size; 291 unsigned int n, dummy, ebx, ecx, edx, l2size;
126 292
127 n = cpuid_eax(0x80000000); 293 n = c->extended_cpuid_level;
128 294
129 if (n >= 0x80000005) { 295 if (n >= 0x80000005) {
130 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 296 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
131 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 297 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
132 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 298 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
133 c->x86_cache_size = (ecx>>24)+(edx>>24); 299 c->x86_cache_size = (ecx>>24) + (edx>>24);
300#ifdef CONFIG_X86_64
301 /* On K8 L1 TLB is inclusive, so don't count it */
302 c->x86_tlbsize = 0;
303#endif
134 } 304 }
135 305
136 if (n < 0x80000006) /* Some chips just has a large L1. */ 306 if (n < 0x80000006) /* Some chips just has a large L1. */
137 return; 307 return;
138 308
139 ecx = cpuid_ecx(0x80000006); 309 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
140 l2size = ecx >> 16; 310 l2size = ecx >> 16;
141 311
312#ifdef CONFIG_X86_64
313 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
314#else
142 /* do processor-specific cache resizing */ 315 /* do processor-specific cache resizing */
143 if (this_cpu->c_size_cache) 316 if (this_cpu->c_size_cache)
144 l2size = this_cpu->c_size_cache(c, l2size); 317 l2size = this_cpu->c_size_cache(c, l2size);
@@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
149 322
150 if (l2size == 0) 323 if (l2size == 0)
151 return; /* Again, no L2 cache is possible */ 324 return; /* Again, no L2 cache is possible */
325#endif
152 326
153 c->x86_cache_size = l2size; 327 c->x86_cache_size = l2size;
154 328
155 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 329 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
156 l2size, ecx & 0xFF); 330 l2size, ecx & 0xFF);
157} 331}
158 332
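As the code above shows, CPUID leaf 0x80000006 reports the L2 size in KB in ECX[31:16] and the line size in bytes in ECX[7:0], and on 64-bit the EBX halves feed x86_tlbsize. A short user-space sketch of the same L2 decode (assuming GCC's <cpuid.h>):

    /* Sketch: reading the L2 size the same way display_cacheinfo() does. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) && eax >= 0x80000006 &&
                __get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
                    printf("L2: %uK, %u bytes/line\n", ecx >> 16, ecx & 0xff);
            else
                    puts("extended cache leaf not supported");
            return 0;
    }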
159/* 333void __cpuinit detect_ht(struct cpuinfo_x86 *c)
160 * Naming convention should be: <Name> [(<Codename>)]
161 * This table only is used unless init_<vendor>() below doesn't set it;
162 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
163 *
164 */
165
166/* Look up CPU names by table lookup. */
167static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
168{ 334{
169 struct cpu_model_info *info; 335#ifdef CONFIG_X86_HT
336 u32 eax, ebx, ecx, edx;
337 int index_msb, core_bits;
170 338
171 if (c->x86_model >= 16) 339 if (!cpu_has(c, X86_FEATURE_HT))
172 return NULL; /* Range check */ 340 return;
173 341
174 if (!this_cpu) 342 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
175 return NULL; 343 goto out;
176 344
177 info = this_cpu->c_models; 345 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
346 return;
178 347
179 while (info && info->family) { 348 cpuid(1, &eax, &ebx, &ecx, &edx);
180 if (info->family == c->x86) 349
181 return info->model_names[c->x86_model]; 350 smp_num_siblings = (ebx & 0xff0000) >> 16;
182 info++; 351
352 if (smp_num_siblings == 1) {
353 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
354 } else if (smp_num_siblings > 1) {
355
356 if (smp_num_siblings > NR_CPUS) {
357 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
358 smp_num_siblings);
359 smp_num_siblings = 1;
360 return;
361 }
362
363 index_msb = get_count_order(smp_num_siblings);
364#ifdef CONFIG_X86_64
365 c->phys_proc_id = phys_pkg_id(index_msb);
366#else
367 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
368#endif
369
370 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
371
372 index_msb = get_count_order(smp_num_siblings);
373
374 core_bits = get_count_order(c->x86_max_cores);
375
376#ifdef CONFIG_X86_64
377 c->cpu_core_id = phys_pkg_id(index_msb) &
378 ((1 << core_bits) - 1);
379#else
380 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
381 ((1 << core_bits) - 1);
382#endif
183 } 383 }
184 return NULL; /* Not found */
185}
186 384
385out:
386 if ((c->x86_max_cores * smp_num_siblings) > 1) {
387 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
388 c->phys_proc_id);
389 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
390 c->cpu_core_id);
391 }
392#endif
393}
187 394
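The rewritten detect_ht() derives topology by peeling fixed-width fields off the initial APIC ID: CPUID leaf 1 EBX[23:16] gives logical processors per package, get_count_order() rounds that up to a power-of-two field width, and phys_pkg_id() shifts the APIC ID right by that width; the core ID is the next field, masked to core_bits. A standalone sketch of the same arithmetic on an example APIC ID (helper and sample values are invented):

    /* Sketch: splitting an initial APIC ID into package/core fields the way
     * detect_ht() does. count_order() mirrors the kernel's get_count_order(). */
    #include <stdio.h>

    static int count_order(unsigned int n)  /* smallest k with (1 << k) >= n */
    {
            int k = 0;
            while ((1u << k) < n)
                    k++;
            return k;
    }

    int main(void)
    {
            unsigned int apicid = 0x05;     /* example: package 1, core 0, thread 1 */
            unsigned int siblings_per_pkg = 4, cores_per_pkg = 2;

            int index_msb = count_order(siblings_per_pkg);
            int thread_bits = count_order(siblings_per_pkg / cores_per_pkg);
            int core_bits = count_order(cores_per_pkg);

            unsigned int phys_proc_id = apicid >> index_msb;
            unsigned int cpu_core_id = (apicid >> thread_bits) & ((1u << core_bits) - 1);

            printf("package %u, core %u\n", phys_proc_id, cpu_core_id);
            return 0;
    }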
188static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 395static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
189{ 396{
190 char *v = c->x86_vendor_id; 397 char *v = c->x86_vendor_id;
191 int i; 398 int i;
192 static int printed; 399 static int printed;
193 400
194 for (i = 0; i < X86_VENDOR_NUM; i++) { 401 for (i = 0; i < X86_VENDOR_NUM; i++) {
195 if (cpu_devs[i]) { 402 if (!cpu_devs[i])
196 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 403 break;
197 (cpu_devs[i]->c_ident[1] && 404
198 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 405 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
199 c->x86_vendor = i; 406 (cpu_devs[i]->c_ident[1] &&
200 if (!early) 407 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
201 this_cpu = cpu_devs[i]; 408 this_cpu = cpu_devs[i];
202 return; 409 c->x86_vendor = this_cpu->c_x86_vendor;
203 } 410 return;
204 } 411 }
205 } 412 }
413
206 if (!printed) { 414 if (!printed) {
207 printed++; 415 printed++;
208 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 416 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
209 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 417 printk(KERN_ERR "CPU: Your system may be unstable.\n");
210 } 418 }
419
211 c->x86_vendor = X86_VENDOR_UNKNOWN; 420 c->x86_vendor = X86_VENDOR_UNKNOWN;
212 this_cpu = &default_cpu; 421 this_cpu = &default_cpu;
213} 422}
214 423
215 424void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
216static int __init x86_fxsr_setup(char *s)
217{
218 setup_clear_cpu_cap(X86_FEATURE_FXSR);
219 setup_clear_cpu_cap(X86_FEATURE_XMM);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char *s)
226{
227 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 425{
263 /* Get vendor name */ 426 /* Get vendor name */
264 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 427 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
267 (unsigned int *)&c->x86_vendor_id[4]); 430 (unsigned int *)&c->x86_vendor_id[4]);
268 431
269 c->x86 = 4; 432 c->x86 = 4;
433 /* Intel-defined flags: level 0x00000001 */
270 if (c->cpuid_level >= 0x00000001) { 434 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc; 435 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 436 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15; 437 c->x86 = (tfms >> 8) & 0xf;
274 c->x86_model = (tfms >> 4) & 15; 438 c->x86_model = (tfms >> 4) & 0xf;
439 c->x86_mask = tfms & 0xf;
275 if (c->x86 == 0xf) 440 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff; 441 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6) 442 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 443 c->x86_model += ((tfms >> 16) & 0xf) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) { 444 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 445 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
446 c->x86_cache_alignment = c->x86_clflush_size;
283 } 447 }
284 } 448 }
285} 449}
286static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 450
451static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
287{ 452{
288 u32 tfms, xlvl; 453 u32 tfms, xlvl;
289 unsigned int ebx; 454 u32 ebx;
290 455
291 memset(&c->x86_capability, 0, sizeof c->x86_capability); 456 /* Intel-defined flags: level 0x00000001 */
292 if (have_cpuid_p()) { 457 if (c->cpuid_level >= 0x00000001) {
293 /* Intel-defined flags: level 0x00000001 */ 458 u32 capability, excap;
294 if (c->cpuid_level >= 0x00000001) { 459 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
295 u32 capability, excap; 460 c->x86_capability[0] = capability;
296 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 461 c->x86_capability[4] = excap;
297 c->x86_capability[0] = capability; 462 }
298 c->x86_capability[4] = excap;
299 }
300 463
301 /* AMD-defined flags: level 0x80000001 */ 464 /* AMD-defined flags: level 0x80000001 */
302 xlvl = cpuid_eax(0x80000000); 465 xlvl = cpuid_eax(0x80000000);
303 if ((xlvl & 0xffff0000) == 0x80000000) { 466 c->extended_cpuid_level = xlvl;
304 if (xlvl >= 0x80000001) { 467 if ((xlvl & 0xffff0000) == 0x80000000) {
305 c->x86_capability[1] = cpuid_edx(0x80000001); 468 if (xlvl >= 0x80000001) {
306 c->x86_capability[6] = cpuid_ecx(0x80000001); 469 c->x86_capability[1] = cpuid_edx(0x80000001);
307 } 470 c->x86_capability[6] = cpuid_ecx(0x80000001);
308 } 471 }
472 }
473
474#ifdef CONFIG_X86_64
475 if (c->extended_cpuid_level >= 0x80000008) {
476 u32 eax = cpuid_eax(0x80000008);
309 477
478 c->x86_virt_bits = (eax >> 8) & 0xff;
479 c->x86_phys_bits = eax & 0xff;
310 } 480 }
481#endif
482
483 if (c->extended_cpuid_level >= 0x80000007)
484 c->x86_power = cpuid_edx(0x80000007);
311 485
312} 486}
313 487
488static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
489{
490#ifdef CONFIG_X86_32
491 int i;
492
493 /*
494 * First of all, decide if this is a 486 or higher
495 * It's a 486 if we can modify the AC flag
496 */
497 if (flag_is_changeable_p(X86_EFLAGS_AC))
498 c->x86 = 4;
499 else
500 c->x86 = 3;
501
502 for (i = 0; i < X86_VENDOR_NUM; i++)
503 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
504 c->x86_vendor_id[0] = 0;
505 cpu_devs[i]->c_identify(c);
506 if (c->x86_vendor_id[0]) {
507 get_cpu_vendor(c);
508 break;
509 }
510 }
511#endif
512}
513
314/* 514/*
315 * Do minimum CPU detection early. 515 * Do minimum CPU detection early.
316 * Fields really needed: vendor, cpuid_level, family, model, mask, 516 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -320,144 +520,147 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
320 * WARNING: this function is only called on the BP. Don't add code here 520 * WARNING: this function is only called on the BP. Don't add code here
321 * that is supposed to run on all CPUs. 521 * that is supposed to run on all CPUs.
322 */ 522 */
323static void __init early_cpu_detect(void) 523static void __init early_identify_cpu(struct cpuinfo_x86 *c)
324{ 524{
325 struct cpuinfo_x86 *c = &boot_cpu_data; 525#ifdef CONFIG_X86_64
326 526 c->x86_clflush_size = 64;
327 c->x86_cache_alignment = 32; 527#else
328 c->x86_clflush_size = 32; 528 c->x86_clflush_size = 32;
529#endif
530 c->x86_cache_alignment = c->x86_clflush_size;
531
532 memset(&c->x86_capability, 0, sizeof c->x86_capability);
533 c->extended_cpuid_level = 0;
534
535 if (!have_cpuid_p())
536 identify_cpu_without_cpuid(c);
329 537
538 /* cyrix could have cpuid enabled via c_identify()*/
330 if (!have_cpuid_p()) 539 if (!have_cpuid_p())
331 return; 540 return;
332 541
333 cpu_detect(c); 542 cpu_detect(c);
334 543
335 get_cpu_vendor(c, 1); 544 get_cpu_vendor(c);
545
546 get_cpu_cap(c);
336 547
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 548 if (this_cpu->c_early_init)
338 cpu_devs[c->x86_vendor]->c_early_init) 549 this_cpu->c_early_init(c);
339 cpu_devs[c->x86_vendor]->c_early_init(c);
340 550
341 early_get_cap(c); 551 validate_pat_support(c);
342} 552}
343 553
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 554void __init early_cpu_init(void)
345{ 555{
346 u32 tfms, xlvl; 556 struct cpu_dev **cdev;
347 unsigned int ebx; 557 int count = 0;
348 558
349 if (have_cpuid_p()) { 559 printk("KERNEL supported cpus:\n");
350 /* Get vendor name */ 560 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
351 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 561 struct cpu_dev *cpudev = *cdev;
352 (unsigned int *)&c->x86_vendor_id[0], 562 unsigned int j;
353 (unsigned int *)&c->x86_vendor_id[8], 563
354 (unsigned int *)&c->x86_vendor_id[4]); 564 if (count >= X86_VENDOR_NUM)
355 565 break;
356 get_cpu_vendor(c, 0); 566 cpu_devs[count] = cpudev;
357 /* Initialize the standard set of capabilities */ 567 count++;
358 /* Note that the vendor-specific code below might override */ 568
359 /* Intel-defined flags: level 0x00000001 */ 569 for (j = 0; j < 2; j++) {
360 if (c->cpuid_level >= 0x00000001) { 570 if (!cpudev->c_ident[j])
361 u32 capability, excap; 571 continue;
362 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 572 printk(" %s %s\n", cpudev->c_vendor,
363 c->x86_capability[0] = capability; 573 cpudev->c_ident[j]);
364 c->x86_capability[4] = excap;
365 c->x86 = (tfms >> 8) & 15;
366 c->x86_model = (tfms >> 4) & 15;
367 if (c->x86 == 0xf)
368 c->x86 += (tfms >> 20) & 0xff;
369 if (c->x86 >= 0x6)
370 c->x86_model += ((tfms >> 16) & 0xF) << 4;
371 c->x86_mask = tfms & 15;
372 c->initial_apicid = (ebx >> 24) & 0xFF;
373#ifdef CONFIG_X86_HT
374 c->apicid = phys_pkg_id(c->initial_apicid, 0);
375 c->phys_proc_id = c->initial_apicid;
376#else
377 c->apicid = c->initial_apicid;
378#endif
379 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
380 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
381 } else {
382 /* Have CPUID level 0 only - unheard of */
383 c->x86 = 4;
384 }
385
386 /* AMD-defined flags: level 0x80000001 */
387 xlvl = cpuid_eax(0x80000000);
388 if ((xlvl & 0xffff0000) == 0x80000000) {
389 if (xlvl >= 0x80000001) {
390 c->x86_capability[1] = cpuid_edx(0x80000001);
391 c->x86_capability[6] = cpuid_ecx(0x80000001);
392 }
393 if (xlvl >= 0x80000004)
394 get_model_name(c); /* Default name */
395 } 574 }
396
397 init_scattered_cpuid_features(c);
398 } 575 }
399 576
577 early_identify_cpu(&boot_cpu_data);
400} 578}
401 579
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 580/*
581 * The NOPL instruction is supposed to exist on all CPUs with
582 * family >= 6; unfortunately, that's not true in practice because
583 * of early VIA chips and (more importantly) broken virtualizers that
584 * are not easy to detect. In the latter case it doesn't even *fail*
585 * reliably, so probing for it doesn't even work. Disable it completely
586 * unless we can find a reliable way to detect all the broken cases.
587 */
588static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
403{ 589{
404 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 590 clear_cpu_cap(c, X86_FEATURE_NOPL);
405 /* Disable processor serial number */
406 unsigned long lo, hi;
407 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
408 lo |= 0x200000;
409 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
410 printk(KERN_NOTICE "CPU serial number disabled.\n");
411 clear_cpu_cap(c, X86_FEATURE_PN);
412
413 /* Disabling the serial number may affect the cpuid level */
414 c->cpuid_level = cpuid_eax(0);
415 }
416} 591}
417 592
418static int __init x86_serial_nr_setup(char *s) 593static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
419{ 594{
420 disable_x86_serial_nr = 0; 595 c->extended_cpuid_level = 0;
421 return 1; 596
422} 597 if (!have_cpuid_p())
423__setup("serialnumber", x86_serial_nr_setup); 598 identify_cpu_without_cpuid(c);
599
600 /* cyrix could have cpuid enabled via c_identify()*/
601 if (!have_cpuid_p())
602 return;
603
604 cpu_detect(c);
605
606 get_cpu_vendor(c);
424 607
608 get_cpu_cap(c);
425 609
610 if (c->cpuid_level >= 0x00000001) {
611 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
612#ifdef CONFIG_X86_32
613# ifdef CONFIG_X86_HT
614 c->apicid = phys_pkg_id(c->initial_apicid, 0);
615# else
616 c->apicid = c->initial_apicid;
617# endif
618#endif
619
620#ifdef CONFIG_X86_HT
621 c->phys_proc_id = c->initial_apicid;
622#endif
623 }
624
625 get_model_name(c); /* Default name */
626
627 init_scattered_cpuid_features(c);
628 detect_nopl(c);
629}
426 630
427/* 631/*
428 * This does the hard work of actually picking apart the CPU stuff... 632 * This does the hard work of actually picking apart the CPU stuff...
429 */ 633 */
430void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 634static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
431{ 635{
432 int i; 636 int i;
433 637
434 c->loops_per_jiffy = loops_per_jiffy; 638 c->loops_per_jiffy = loops_per_jiffy;
435 c->x86_cache_size = -1; 639 c->x86_cache_size = -1;
436 c->x86_vendor = X86_VENDOR_UNKNOWN; 640 c->x86_vendor = X86_VENDOR_UNKNOWN;
437 c->cpuid_level = -1; /* CPUID not detected */
438 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 641 c->x86_model = c->x86_mask = 0; /* So far unknown... */
439 c->x86_vendor_id[0] = '\0'; /* Unset */ 642 c->x86_vendor_id[0] = '\0'; /* Unset */
440 c->x86_model_id[0] = '\0'; /* Unset */ 643 c->x86_model_id[0] = '\0'; /* Unset */
441 c->x86_max_cores = 1; 644 c->x86_max_cores = 1;
645 c->x86_coreid_bits = 0;
646#ifdef CONFIG_X86_64
647 c->x86_clflush_size = 64;
648#else
649 c->cpuid_level = -1; /* CPUID not detected */
442 c->x86_clflush_size = 32; 650 c->x86_clflush_size = 32;
651#endif
652 c->x86_cache_alignment = c->x86_clflush_size;
443 memset(&c->x86_capability, 0, sizeof c->x86_capability); 653 memset(&c->x86_capability, 0, sizeof c->x86_capability);
444 654
445 if (!have_cpuid_p()) {
446 /*
447 * First of all, decide if this is a 486 or higher
448 * It's a 486 if we can modify the AC flag
449 */
450 if (flag_is_changeable_p(X86_EFLAGS_AC))
451 c->x86 = 4;
452 else
453 c->x86 = 3;
454 }
455
456 generic_identify(c); 655 generic_identify(c);
457 656
458 if (this_cpu->c_identify) 657 if (this_cpu->c_identify)
459 this_cpu->c_identify(c); 658 this_cpu->c_identify(c);
460 659
660#ifdef CONFIG_X86_64
661 c->apicid = phys_pkg_id(0);
662#endif
663
461 /* 664 /*
462 * Vendor-specific initialization. In this section we 665 * Vendor-specific initialization. In this section we
463 * canonicalize the feature flags, meaning if there are 666 * canonicalize the feature flags, meaning if there are
@@ -491,6 +694,10 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
491 c->x86, c->x86_model); 694 c->x86, c->x86_model);
492 } 695 }
493 696
697#ifdef CONFIG_X86_64
698 detect_ht(c);
699#endif
700
494 /* 701 /*
495 * On SMP, boot_cpu_data holds the common feature set between 702 * On SMP, boot_cpu_data holds the common feature set between
496 * all CPUs; so make sure that we indicate which features are 703 * all CPUs; so make sure that we indicate which features are
@@ -499,7 +706,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
499 */ 706 */
500 if (c != &boot_cpu_data) { 707 if (c != &boot_cpu_data) {
501 /* AND the already accumulated flags with these */ 708 /* AND the already accumulated flags with these */
502 for (i = 0 ; i < NCAPINTS ; i++) 709 for (i = 0; i < NCAPINTS; i++)
503 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 710 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
504 } 711 }
505 712
@@ -507,72 +714,91 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
507 for (i = 0; i < NCAPINTS; i++) 714 for (i = 0; i < NCAPINTS; i++)
508 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 715 c->x86_capability[i] &= ~cleared_cpu_caps[i];
509 716
717#ifdef CONFIG_X86_MCE
510 /* Init Machine Check Exception if available. */ 718 /* Init Machine Check Exception if available. */
511 mcheck_init(c); 719 mcheck_init(c);
720#endif
512 721
513 select_idle_routine(c); 722 select_idle_routine(c);
723
724#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
725 numa_add_cpu(smp_processor_id());
726#endif
514} 727}
515 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
516void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
517{ 740{
518 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
742#ifdef CONFIG_X86_32
519 sysenter_setup(); 743 sysenter_setup();
520 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
747#endif
521} 748}
522 749
523void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 750void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
524{ 751{
525 BUG_ON(c == &boot_cpu_data); 752 BUG_ON(c == &boot_cpu_data);
526 identify_cpu(c); 753 identify_cpu(c);
754#ifdef CONFIG_X86_32
527 enable_sep_cpu(); 755 enable_sep_cpu();
756#endif
528 mtrr_ap_init(); 757 mtrr_ap_init();
529} 758}
530 759
531#ifdef CONFIG_X86_HT 760struct msr_range {
532void __cpuinit detect_ht(struct cpuinfo_x86 *c) 761 unsigned min;
533{ 762 unsigned max;
534 u32 eax, ebx, ecx, edx; 763};
535 int index_msb, core_bits;
536
537 cpuid(1, &eax, &ebx, &ecx, &edx);
538
539 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
540 return;
541
542 smp_num_siblings = (ebx & 0xff0000) >> 16;
543 764
544 if (smp_num_siblings == 1) { 765static struct msr_range msr_range_array[] __cpuinitdata = {
545 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 766 { 0x00000000, 0x00000418},
546 } else if (smp_num_siblings > 1) { 767 { 0xc0000000, 0xc000040b},
768 { 0xc0010000, 0xc0010142},
769 { 0xc0011000, 0xc001103b},
770};
547 771
548 if (smp_num_siblings > NR_CPUS) { 772static void __cpuinit print_cpu_msr(void)
549 printk(KERN_WARNING "CPU: Unsupported number of the " 773{
550 "siblings %d", smp_num_siblings); 774 unsigned index;
551 smp_num_siblings = 1; 775 u64 val;
552 return; 776 int i;
777 unsigned index_min, index_max;
778
779 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
780 index_min = msr_range_array[i].min;
781 index_max = msr_range_array[i].max;
782 for (index = index_min; index < index_max; index++) {
783 if (rdmsrl_amd_safe(index, &val))
784 continue;
785 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
553 } 786 }
787 }
788}
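print_cpu_msr() walks fixed MSR index ranges with rdmsrl_amd_safe(), silently skipping indices that fault, and is gated by the show_msr= parameter registered just below. The same registers can be inspected from user space through the msr driver, which exposes /dev/cpu/N/msr and returns 8 bytes at the file offset equal to the MSR index (a sketch assuming that driver is loaded and the caller has the needed privilege):

    /* Sketch: reading one MSR from user space via the msr driver. */
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0) {
                    perror("open /dev/cpu/0/msr");
                    return 1;
            }
            /* offset == MSR index; 0x10 is IA32_TIME_STAMP_COUNTER */
            if (pread(fd, &val, sizeof(val), 0x10) == sizeof(val))
                    printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
            close(fd);
            return 0;
    }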
554 789
555 index_msb = get_count_order(smp_num_siblings); 790static int show_msr __cpuinitdata;
556 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 791static __init int setup_show_msr(char *arg)
557 792{
558 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 793 int num;
559 c->phys_proc_id);
560
561 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
562
563 index_msb = get_count_order(smp_num_siblings) ;
564
565 core_bits = get_count_order(c->x86_max_cores);
566 794
567 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 795 get_option(&arg, &num);
568 ((1 << core_bits) - 1);
569 796
570 if (c->x86_max_cores > 1) 797 if (num > 0)
571 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 798 show_msr = num;
572 c->cpu_core_id); 799 return 1;
573 }
574} 800}
575#endif 801__setup("show_msr=", setup_show_msr);
576 802
577static __init int setup_noclflush(char *arg) 803static __init int setup_noclflush(char *arg)
578{ 804{
@@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
591 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
592 818
593 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
594 printk("%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
595 821
596 if (!c->x86_model_id[0]) 822 if (c->x86_model_id[0])
597 printk("%d86", c->x86); 823 printk(KERN_CONT "%s", c->x86_model_id);
598 else 824 else
599 printk("%s", c->x86_model_id); 825 printk(KERN_CONT "%d86", c->x86);
600 826
601 if (c->x86_mask || c->cpuid_level >= 0) 827 if (c->x86_mask || c->cpuid_level >= 0)
602 printk(" stepping %02x\n", c->x86_mask); 828 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
603 else 829 else
604 printk("\n"); 830 printk(KERN_CONT "\n");
831
832#ifdef CONFIG_SMP
833 if (c->cpu_index < show_msr)
834 print_cpu_msr();
835#else
836 if (show_msr)
837 print_cpu_msr();
838#endif
605} 839}
606 840
607static __init int setup_disablecpuid(char *arg) 841static __init int setup_disablecpuid(char *arg)
@@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
617 851
618cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 852cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
619 853
620void __init early_cpu_init(void) 854#ifdef CONFIG_X86_64
855struct x8664_pda **_cpu_pda __read_mostly;
856EXPORT_SYMBOL(_cpu_pda);
857
858struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
859
860char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
861
862void __cpuinit pda_init(int cpu)
621{ 863{
622 struct cpu_vendor_dev *cvdev; 864 struct x8664_pda *pda = cpu_pda(cpu);
865
 866 /* Set up data that may be needed in __get_free_pages early */
867 loadsegment(fs, 0);
868 loadsegment(gs, 0);
 869 /* Memory clobbers used to order PDA accesses */
870 mb();
871 wrmsrl(MSR_GS_BASE, pda);
872 mb();
873
874 pda->cpunumber = cpu;
875 pda->irqcount = -1;
876 pda->kernelstack = (unsigned long)stack_thread_info() -
877 PDA_STACKOFFSET + THREAD_SIZE;
878 pda->active_mm = &init_mm;
879 pda->mmu_state = 0;
880
881 if (cpu == 0) {
882 /* others are initialized in smpboot.c */
883 pda->pcurrent = &init_task;
884 pda->irqstackptr = boot_cpu_stack;
885 pda->irqstackptr += IRQSTACKSIZE - 64;
886 } else {
887 if (!pda->irqstackptr) {
888 pda->irqstackptr = (char *)
889 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
890 if (!pda->irqstackptr)
891 panic("cannot allocate irqstack for cpu %d",
892 cpu);
893 pda->irqstackptr += IRQSTACKSIZE - 64;
894 }
895
896 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
897 pda->nodenumber = cpu_to_node(cpu);
898 }
899}
900
901char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
902 DEBUG_STKSZ] __page_aligned_bss;
623 903
624 for (cvdev = __x86cpuvendor_start ; 904extern asmlinkage void ignore_sysret(void);
625 cvdev < __x86cpuvendor_end ;
626 cvdev++)
627 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
628 905
629 early_cpu_detect(); 906/* May not be marked __init: used by software suspend */
630 validate_pat_support(&boot_cpu_data); 907void syscall_init(void)
908{
909 /*
910 * LSTAR and STAR live in a bit strange symbiosis.
911 * They both write to the same internal register. STAR allows to
912 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
913 */
914 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
915 wrmsrl(MSR_LSTAR, system_call);
916 wrmsrl(MSR_CSTAR, ignore_sysret);
917
918#ifdef CONFIG_IA32_EMULATION
919 syscall32_cpu_init();
920#endif
921
922 /* Flags to clear on syscall */
923 wrmsrl(MSR_SYSCALL_MASK,
924 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
631} 925}
632 926
927unsigned long kernel_eflags;
928
929/*
930 * Copies of the original ist values from the tss are only accessed during
931 * debugging, no special alignment required.
932 */
933DEFINE_PER_CPU(struct orig_ist, orig_ist);
934
935#else
936
633/* Make sure %fs is initialized properly in idle threads */ 937/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 938struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 939{
@@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
637 regs->fs = __KERNEL_PERCPU; 941 regs->fs = __KERNEL_PERCPU;
638 return regs; 942 return regs;
639} 943}
640 944#endif
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct desc_ptr gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652 945
653/* 946/*
654 * cpu_init() initializes state that is per-CPU. Some data is already 947 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT 948 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a 949 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across. 950 * 'CPU state barrier', nothing should get across.
951 * A lot of state is already set up in PDA init for 64 bit
658 */ 952 */
953#ifdef CONFIG_X86_64
954void __cpuinit cpu_init(void)
955{
956 int cpu = stack_smp_processor_id();
957 struct tss_struct *t = &per_cpu(init_tss, cpu);
958 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
959 unsigned long v;
960 char *estacks = NULL;
961 struct task_struct *me;
962 int i;
963
964 /* CPU 0 is initialised in head64.c */
965 if (cpu != 0)
966 pda_init(cpu);
967 else
968 estacks = boot_exception_stacks;
969
970 me = current;
971
972 if (cpu_test_and_set(cpu, cpu_initialized))
973 panic("CPU#%d already initialized!\n", cpu);
974
975 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
976
977 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
978
979 /*
980 * Initialize the per-CPU GDT with the boot GDT,
981 * and set up the GDT descriptor:
982 */
983
984 switch_to_new_gdt();
985 load_idt((const struct desc_ptr *)&idt_descr);
986
987 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
988 syscall_init();
989
990 wrmsrl(MSR_FS_BASE, 0);
991 wrmsrl(MSR_KERNEL_GS_BASE, 0);
992 barrier();
993
994 check_efer();
995 if (cpu != 0 && x2apic)
996 enable_x2apic();
997
998 /*
999 * set up and load the per-CPU TSS
1000 */
1001 if (!orig_ist->ist[0]) {
1002 static const unsigned int order[N_EXCEPTION_STACKS] = {
1003 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1004 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1005 };
1006 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1007 if (cpu) {
1008 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1009 if (!estacks)
1010 panic("Cannot allocate exception "
1011 "stack %ld %d\n", v, cpu);
1012 }
1013 estacks += PAGE_SIZE << order[v];
1014 orig_ist->ist[v] = t->x86_tss.ist[v] =
1015 (unsigned long)estacks;
1016 }
1017 }
1018
1019 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1020 /*
1021 * <= is required because the CPU will access up to
1022 * 8 bits beyond the end of the IO permission bitmap.
1023 */
1024 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1025 t->io_bitmap[i] = ~0UL;
1026
1027 atomic_inc(&init_mm.mm_count);
1028 me->active_mm = &init_mm;
1029 if (me->mm)
1030 BUG();
1031 enter_lazy_tlb(&init_mm, me);
1032
1033 load_sp0(t, &current->thread);
1034 set_tss_desc(cpu, t);
1035 load_TR_desc();
1036 load_LDT(&init_mm.context);
1037
1038#ifdef CONFIG_KGDB
1039 /*
1040 * If the kgdb is connected no debug regs should be altered. This
1041 * is only applicable when KGDB and a KGDB I/O module are built
1042 * into the kernel and you are using early debugging with
1043 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1044 */
1045 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1046 arch_kgdb_ops.correct_hw_break();
1047 else {
1048#endif
1049 /*
1050 * Clear all 6 debug registers:
1051 */
1052
1053 set_debugreg(0UL, 0);
1054 set_debugreg(0UL, 1);
1055 set_debugreg(0UL, 2);
1056 set_debugreg(0UL, 3);
1057 set_debugreg(0UL, 6);
1058 set_debugreg(0UL, 7);
1059#ifdef CONFIG_KGDB
1060 /* If the kgdb is connected no debug regs should be altered. */
1061 }
1062#endif
1063
1064 fpu_init();
1065
1066 raw_local_save_flags(kernel_eflags);
1067
1068 if (is_uv_system())
1069 uv_cpu_init();
1070}
1071
1072#else
1073
659void __cpuinit cpu_init(void) 1074void __cpuinit cpu_init(void)
660{ 1075{
661 int cpu = smp_processor_id(); 1076 int cpu = smp_processor_id();
@@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void)
709 /* 1124 /*
710 * Force FPU initialization: 1125 * Force FPU initialization:
711 */ 1126 */
712 current_thread_info()->status = 0; 1127 if (cpu_has_xsave)
1128 current_thread_info()->status = TS_XSAVE;
1129 else
1130 current_thread_info()->status = 0;
713 clear_used_math(); 1131 clear_used_math();
714 mxcsr_feature_mask_init(); 1132 mxcsr_feature_mask_init();
715}
716 1133
717#ifdef CONFIG_HOTPLUG_CPU 1134 /*
718void __cpuinit cpu_uninit(void) 1135 * Boot processor to setup the FP and extended state context info.
719{ 1136 */
720 int cpu = raw_smp_processor_id(); 1137 if (!smp_processor_id())
721 cpu_clear(cpu, cpu_initialized); 1138 init_thread_xstate();
722 1139
723 /* lazy TLB state */ 1140 xsave_init();
724 per_cpu(cpu_tlbstate, cpu).state = 0;
725 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
726} 1141}
1142
1143
727#endif 1144#endif
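On the 64-bit side, syscall_init() programs the SYSCALL machinery exactly as its comment describes: STAR carries the segment selector bases in its upper halves (user in bits 63:48, kernel in bits 47:32), LSTAR holds the 64-bit entry point, and SYSCALL_MASK lists the RFLAGS bits cleared on entry. A worked composition of the STAR value (selector numbers below are placeholders, not the kernel's __KERNEL_CS/__USER32_CS constants):

    /* Sketch: composing MSR_STAR the way syscall_init() does, with placeholder selectors. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint16_t user32_cs = 0x23;      /* placeholder */
            uint16_t kernel_cs = 0x10;      /* placeholder */
            uint64_t star = ((uint64_t)user32_cs << 48) | ((uint64_t)kernel_cs << 32);

            printf("MSR_STAR = %#llx (SYSRET base %#x, SYSCALL base %#x)\n",
                   (unsigned long long)star,
                   (unsigned int)(star >> 48), (unsigned int)((star >> 32) & 0xffff));
            return 0;
    }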
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a738..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,6 @@
1#ifndef ARCH_X86_CPU_H
2
3#define ARCH_X86_CPU_H
1 4
2struct cpu_model_info { 5struct cpu_model_info {
3 int vendor; 6 int vendor;
@@ -18,21 +21,16 @@ struct cpu_dev {
18 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
19 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
20 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
21}; 25};
22 26
23extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
24 28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
25struct cpu_vendor_dev { 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
26 int vendor; 30 &cpu_devX;
27 struct cpu_dev *cpu_dev;
28};
29
30#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
31 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
32 __attribute__((__section__(".x86cpuvendor.init"))) = \
33 { cpu_vendor_id, cpu_dev }
34 31
35extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
36 33
37extern int get_model_name(struct cpuinfo_x86 *c);
38extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
35
36#endif
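The old cpu_vendor_dev table indexed by vendor number gives way to a link-time array: each vendor file drops a pointer into the .x86_cpu_dev.init section through cpu_dev_register(), and early_cpu_init() in the common.c hunk above walks the pointers between __x86_cpu_dev_start and __x86_cpu_dev_end. The same registration idiom can be reproduced outside the kernel; the sketch below assumes GNU ld, which generates __start_/__stop_ boundary symbols for any section whose name is a valid C identifier:

    /* Sketch: collecting registrations in a named section and iterating it,
     * relying on GNU ld's automatic __start_/__stop_ section symbols. */
    #include <stdio.h>

    struct driver {
            const char *name;
    };

    #define driver_register(drv)                                    \
            static struct driver *__drv_##drv                       \
            __attribute__((used, section("drv_table"))) = &(drv)

    static struct driver intel_drv = { "GenuineIntel" };
    static struct driver amd_drv   = { "AuthenticAMD" };

    driver_register(intel_drv);
    driver_register(amd_drv);

    extern struct driver *__start_drv_table[], *__stop_drv_table[];

    int main(void)
    {
            struct driver **d;

            for (d = __start_drv_table; d < __stop_drv_table; d++)
                    printf("registered: %s\n", (*d)->name);
            return 0;
    }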
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..c24c4a487b7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -451,7 +452,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 452
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 453 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 454 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 455 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 456 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 457 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 458 }
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 467 }
467 } 468 }
468 469
469 for_each_cpu_mask(i, cmd.mask) { 470 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 471 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 472 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 473 }
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
785 if (ret) 786 if (ret)
786 return ret; 787 return ret;
787 788
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 789 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
790 if (ret)
791 free_percpu(acpi_perf_data);
792
793 return ret;
789} 794}
790 795
791static void __exit acpi_cpufreq_exit(void) 796static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 800 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 801
797 free_percpu(acpi_perf_data); 802 free_percpu(acpi_perf_data);
798
799 return;
800} 803}
801 804
802module_param(acpi_pstate_strict, uint, 0644); 805module_param(acpi_pstate_strict, uint, 0644);
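get_measured_perf() now receives the policy so it can scale that CPU's maximum frequency by the APERF/MPERF ratio; as the retained comment notes, only the ratio of the two MSRs is architecturally meaningful. The estimate is effectively max_freq * delta(APERF) / delta(MPERF). A compact sketch of that arithmetic (counter deltas are invented sample values; reading the MSRs themselves would go through rdmsr or the msr driver):

    /* Sketch: the APERF/MPERF frequency estimate used by get_measured_perf(). */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t aperf_delta = 1200000000ULL;   /* cycles at delivered frequency */
            uint64_t mperf_delta = 2000000000ULL;   /* cycles at max (TSC) frequency */
            unsigned int max_khz = 2400000;         /* policy->cpuinfo.max_freq analogue */

            unsigned int measured_khz =
                    (unsigned int)((max_khz * aperf_delta) / mperf_delta);

            printf("measured ~ %u kHz (%u%% of max)\n",
                   measured_khz, (unsigned int)(aperf_delta * 100 / mperf_delta));
            return 0;
    }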
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618e..965ea52767ac 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
26#define NFORCE2_SAFE_DISTANCE 50 26#define NFORCE2_SAFE_DISTANCE 50
27 27
28/* Delay in ms between FSB changes */ 28/* Delay in ms between FSB changes */
29//#define NFORCE2_DELAY 10 29/* #define NFORCE2_DELAY 10 */
30 30
31/* nforce2_chipset: 31/*
32 * nforce2_chipset:
32 * FSB is changed using the chipset 33 * FSB is changed using the chipset
33 */ 34 */
34static struct pci_dev *nforce2_chipset_dev; 35static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
36/* fid: 37/* fid:
37 * multiplier * 10 38 * multiplier * 10
38 */ 39 */
39static int fid = 0; 40static int fid;
40 41
41/* min_fsb, max_fsb: 42/* min_fsb, max_fsb:
42 * minimum and maximum FSB (= FSB at boot time) 43 * minimum and maximum FSB (= FSB at boot time)
43 */ 44 */
44static int min_fsb = 0; 45static int min_fsb;
45static int max_fsb = 0; 46static int max_fsb;
46 47
47MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); 48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
48MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); 49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
53 54
54MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); 55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
55MODULE_PARM_DESC(min_fsb, 56MODULE_PARM_DESC(min_fsb,
56 "Minimum FSB to use, if not defined: current FSB - 50"); 57 "Minimum FSB to use, if not defined: current FSB - 50");
57 58
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) 59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
59 60
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
139 140
140 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ 141 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
141 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 142 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
142 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL); 143 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
143 if (!nforce2_sub5) 144 if (!nforce2_sub5)
144 return 0; 145 return 0;
145 146
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
147 fsb /= 1000000; 148 fsb /= 1000000;
148 149
149 /* Check if PLL register is already set */ 150 /* Check if PLL register is already set */
150 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); 151 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
151 152
152 if(bootfsb || !temp) 153 if (bootfsb || !temp)
153 return fsb; 154 return fsb;
154 155
155 /* Use PLL register FSB value */ 156 /* Use PLL register FSB value */
156 pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp); 157 pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
157 fsb = nforce2_calc_fsb(temp); 158 fsb = nforce2_calc_fsb(temp);
158 159
159 return fsb; 160 return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
184 } 185 }
185 186
186 /* First write? Then set actual value */ 187 /* First write? Then set actual value */
187 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); 188 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
188 if (!temp) { 189 if (!temp) {
189 pll = nforce2_calc_pll(tfsb); 190 pll = nforce2_calc_pll(tfsb);
190 191
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
210 tfsb--; 211 tfsb--;
211 212
212 /* Calculate the PLL reg. value */ 213 /* Calculate the PLL reg. value */
213 if ((pll = nforce2_calc_pll(tfsb)) == -1) 214 pll = nforce2_calc_pll(tfsb);
215 if (pll == -1)
214 return -EINVAL; 216 return -EINVAL;
215 217
216 nforce2_write_pll(pll); 218 nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
249static int nforce2_target(struct cpufreq_policy *policy, 251static int nforce2_target(struct cpufreq_policy *policy,
250 unsigned int target_freq, unsigned int relation) 252 unsigned int target_freq, unsigned int relation)
251{ 253{
252// unsigned long flags; 254/* unsigned long flags; */
253 struct cpufreq_freqs freqs; 255 struct cpufreq_freqs freqs;
254 unsigned int target_fsb; 256 unsigned int target_fsb;
255 257
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
271 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 273 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
272 274
273 /* Disable IRQs */ 275 /* Disable IRQs */
274 //local_irq_save(flags); 276 /* local_irq_save(flags); */
275 277
276 if (nforce2_set_fsb(target_fsb) < 0) 278 if (nforce2_set_fsb(target_fsb) < 0)
277 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", 279 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
278 target_fsb); 280 target_fsb);
279 else 281 else
280 dprintk("Changed FSB successfully to %d\n", 282 dprintk("Changed FSB successfully to %d\n",
281 target_fsb); 283 target_fsb);
282 284
283 /* Enable IRQs */ 285 /* Enable IRQs */
284 //local_irq_restore(flags); 286 /* local_irq_restore(flags); */
285 287
286 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 288 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
287 289
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
302 policy->max = (fsb_pol_max + 1) * fid * 100; 304 policy->max = (fsb_pol_max + 1) * fid * 100;
303 305
304 cpufreq_verify_within_limits(policy, 306 cpufreq_verify_within_limits(policy,
305 policy->cpuinfo.min_freq, 307 policy->cpuinfo.min_freq,
306 policy->cpuinfo.max_freq); 308 policy->cpuinfo.max_freq);
307 return 0; 309 return 0;
308} 310}
309 311
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
347 /* Set maximum FSB to FSB at boot time */ 349 /* Set maximum FSB to FSB at boot time */
348 max_fsb = nforce2_fsb_read(1); 350 max_fsb = nforce2_fsb_read(1);
349 351
350 if(!max_fsb) 352 if (!max_fsb)
351 return -EIO; 353 return -EIO;
352 354
353 if (!min_fsb) 355 if (!min_fsb)
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
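
The p4-clockmod target path shown above brackets the duty-cycle change with PRECHANGE/POSTCHANGE notifications and now walks policy->cpus with for_each_cpu_mask_nr(), which stops at nr_cpu_ids rather than NR_CPUS. A sketch of that bracket under the same assumptions; cpufreq_p4_setdc() comes from the driver, while the wrapper itself is hypothetical:

static void p4_notify_and_setdc(struct cpufreq_policy *policy,
				struct cpufreq_freqs *freqs,
				unsigned int index)
{
	unsigned int i;

	for_each_cpu_mask_nr(i, policy->cpus) {		/* bounded by nr_cpu_ids */
		freqs->cpu = i;
		cpufreq_notify_transition(freqs, CPUFREQ_PRECHANGE);
	}

	for_each_cpu_mask_nr(i, policy->cpus)
		cpufreq_p4_setdc(i, index);		/* per-CPU duty cycle write */

	for_each_cpu_mask_nr(i, policy->cpus) {
		freqs->cpu = i;
		cpufreq_notify_transition(freqs, CPUFREQ_POSTCHANGE);
	}
}
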
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
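
powernow-k6 gates its PowerNow! I/O port behind MSR_K6_EPMR: writing POWERNOW_IOPORT+1 to the MSR exposes the port, the status longword is read at offset 0x8, and writing POWERNOW_IOPORT+0 hides it again, exactly as the functions above do. A hedged sketch of just that gate; the helper name is made up:

static unsigned long k6_read_powernow_status(void)
{
	unsigned long val;

	wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x1, 0);	/* expose the PowerNow port */
	val = inl(POWERNOW_IOPORT + 0x8);		/* read status/multiplier bits */
	wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x0, 0);	/* hide it again */

	return val;
}
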
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..84bb395038d8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 66 return 800 + (fid * 100);
67} 67}
68 68
69
70/* Return a frequency in KHz, given an input fid */ 69/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 70static u32 find_khz_freq_from_fid(u32 fid)
72{ 71{
@@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 77 return data[pstate].frequency;
79} 78}
80 79
81
82/* Return the vco fid for an input fid 80/* Return the vco fid for an input fid
83 * 81 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 82 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +164,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 164 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 165}
168 166
169
170/* write the new fid value along with the other control fields to the msr */ 167/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 168static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 169{
@@ -966,7 +963,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 963 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 964 freqs.new = find_khz_freq_from_fid(fid);
968 965
969 for_each_cpu_mask(i, *(data->available_cores)) { 966 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 967 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 968 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 969 }
@@ -974,7 +971,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 971 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 972 freqs.new = find_khz_freq_from_fid(data->currfid);
976 973
977 for_each_cpu_mask(i, *(data->available_cores)) { 974 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 975 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 976 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 977 }
@@ -997,7 +994,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 994 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 995 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 996
1000 for_each_cpu_mask(i, *(data->available_cores)) { 997 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 998 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 999 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1000 }
@@ -1005,7 +1002,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1002 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1003 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1004
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1005 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1006 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1007 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1008 }
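
The powernow-k8 hunk above shows find_freq_from_fid() returning 800 + fid*100, i.e. a frequency in MHz starting at 800 MHz with 100 MHz per FID step; the kHz variant's body is not visible here, so the scaling below is an assumption. Illustrative helpers only:

static u32 fid_to_mhz(u32 fid)
{
	return 800 + (fid * 100);	/* e.g. fid 10 -> 1800 MHz, per the hunk above */
}

static u32 fid_to_khz(u32 fid)
{
	return fid_to_mhz(fid) * 1000;	/* kHz convention assumed, not shown in the diff */
}
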
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,9 +26,10 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 358 int i;
348 359
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 363 return -ENODEV;
352 364
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 373 break;
362 374
363 if (i != N_IDS) 375 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 377
366 if (!centrino_cpu[policy->cpu]) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 381 MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 398 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 400 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 403 return -ENODEV;
391 } 404 }
392 } 405 }
393 406
394 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
395 408 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 409 /* 10uS transition latency */
397 policy->cur = freq; 410 policy->cur = freq;
398 411
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 413
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 416 if (ret)
403 return (ret); 417 return (ret);
404 418
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 421
407 return 0; 422 return 0;
408} 423}
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 426{
412 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
413 428
414 if (!centrino_model[cpu]) 429 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 430 return -ENODEV;
416 431
417 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
418 433
419 centrino_model[cpu] = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
420 435
421 return 0; 436 return 0;
422} 437}
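
The centrino hunks above replace the NR_CPUS-sized centrino_model[]/centrino_cpu[] arrays with DEFINE_PER_CPU variables, so every var[cpu] access becomes per_cpu(var, cpu). A minimal sketch of the pattern, assuming only the driver's struct cpu_model; the accessor helpers are illustrative:

#include <linux/percpu.h>

struct cpu_model;					/* as declared by the driver */

static DEFINE_PER_CPU(struct cpu_model *, centrino_model);

static struct cpu_model *centrino_model_of(unsigned int cpu)
{
	return per_cpu(centrino_model, cpu);		/* was centrino_model[cpu] */
}

static void centrino_model_set(unsigned int cpu, struct cpu_model *model)
{
	per_cpu(centrino_model, cpu) = model;
}
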
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 445 */
431static int centrino_verify (struct cpufreq_policy *policy) 446static int centrino_verify (struct cpufreq_policy *policy)
432{ 447{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points);
434} 450}
435 451
436/** 452/**
437 * centrino_setpolicy - set a new CPUFreq policy 453 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 454 * @policy: new policy
439 * @target_freq: the target frequency 455 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 458 *
442 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
443 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
444static int centrino_target (struct cpufreq_policy *policy, 468static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 469 unsigned int target_freq,
446 unsigned int relation) 470 unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 472 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 473 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 474 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 475 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 476 unsigned int j, k, first_cpu, tmp;
457 477 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 478 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482
483 if (unlikely(allmasks == NULL))
484 return -ENOMEM;
485
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV;
488 goto out;
489 }
460 490
461 if (unlikely(cpufreq_frequency_table_target(policy, 491 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 492 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 493 target_freq,
464 relation, 494 relation,
465 &newstate))) { 495 &newstate))) {
466 return -EINVAL; 496 retval = -EINVAL;
497 goto out;
467 } 498 }
468 499
469#ifdef CONFIG_HOTPLUG_CPU 500#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 501 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 503#else
473 online_policy_cpus = policy->cpus; 504 *online_policy_cpus = policy->cpus;
474#endif 505#endif
475 506
476 saved_mask = current->cpus_allowed; 507 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 508 first_cpu = 1;
478 cpus_clear(covered_cpus); 509 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 510 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 511 /*
481 * Support for SMP systems. 512 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 513 * Make sure we are running on CPU that wants to change freq
483 */ 514 */
484 cpus_clear(set_mask); 515 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 517 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 518 else
488 cpu_set(j, set_mask); 519 cpu_set(j, *set_mask);
489 520
490 set_cpus_allowed_ptr(current, &set_mask); 521 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 522 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 524 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 525 retval = -EAGAIN;
495 if (first_cpu) { 526 if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 531 break;
501 } 532 }
502 533
503 msr = centrino_model[cpu]->op_points[newstate].index; 534 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 535
505 if (first_cpu) { 536 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 537 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 549 target_freq, freqs.old, freqs.new, msr);
519 550
520 for_each_cpu_mask(k, online_policy_cpus) { 551 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 552 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 553 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 554 CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 567 break;
537 } 568 }
538 569
539 cpu_set(j, covered_cpus); 570 cpu_set(j, *covered_cpus);
540 preempt_enable(); 571 preempt_enable();
541 } 572 }
542 573
543 for_each_cpu_mask(k, online_policy_cpus) { 574 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 575 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 577 }
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 584 * Best effort undo..
554 */ 585 */
555 586
556 if (!cpus_empty(covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
557 for_each_cpu_mask(j, covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
558 set_cpus_allowed_ptr(current, 589 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j)); 590 &cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 592 }
562 }
563 593
564 tmp = freqs.new; 594 tmp = freqs.new;
565 freqs.new = freqs.old; 595 freqs.new = freqs.old;
566 freqs.old = tmp; 596 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 597 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 598 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 601 }
572 } 602 }
573 set_cpus_allowed_ptr(current, &saved_mask); 603 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 604 retval = 0;
605 goto out;
575 606
576migrate_end: 607migrate_end:
577 preempt_enable(); 608 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 609 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 610out:
611 CPUMASK_FREE(allmasks);
612 return retval;
580} 613}
581 614
582static struct freq_attr* centrino_attr[] = { 615static struct freq_attr* centrino_attr[] = {
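
centrino_target() above also stops keeping four cpumask_t locals on the kernel stack: they move into struct allmasks (defined in the hunk above), which CPUMASK_ALLOC() may heap-allocate when NR_CPUS is large, CPUMASK_PTR() exposes as pointers, and CPUMASK_FREE() releases on the shared out: path. A sketch of that shape, reusing the struct and macros from the diff; the function body and error check are illustrative:

static int example_target(struct cpufreq_policy *policy)
{
	int retval = 0;
	CPUMASK_ALLOC(allmasks);			/* may kmalloc when NR_CPUS is large */
	CPUMASK_PTR(online_policy_cpus, allmasks);	/* cpumask_t * into the struct */
	CPUMASK_PTR(saved_mask, allmasks);

	if (unlikely(allmasks == NULL))
		return -ENOMEM;

	*saved_mask = current->cpus_allowed;
	cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
	if (cpus_empty(*online_policy_cpus)) {
		retval = -ENODEV;
		goto out;
	}

	/* ... per-CPU MSR work happens here, as in centrino_target() above ... */

	set_cpus_allowed_ptr(current, saved_mask);
out:
	CPUMASK_FREE(allmasks);				/* kfree or no-op, matching ALLOC */
	return retval;
}
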
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..191f7263c61d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask(i, policy->cpus) { 282 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
116 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
117 122
118 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
119 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
120 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
121 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
122 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
127 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
128 133
129 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
130 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
131 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
132 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
133 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
135}
136
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152} 140}
153 141
154/* 142/*
@@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void)
162 local_irq_save(flags); 150 local_irq_save(flags);
163 151
164 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
165 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
166 154
167 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
168 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
169 157
170 158
171 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
172 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
173 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
286 /* GXm supports extended cpuid levels 'ala' AMD */ 291 /* GXm supports extended cpuid levels 'ala' AMD */
287 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
288 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
289 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
290 295
291 /* 296 /*
292 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
296 */ 301 */
297 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
298 geode_configure(); 303 geode_configure();
299 get_model_name(c); /* get CPU marketing name */
300 return; 304 return;
301 } else { /* MediaGX */ 305 } else { /* MediaGX */
302 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
309 if (dir1 > 7) { 313 if (dir1 > 7) {
310 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
311 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
312 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
313 } else { 317 } else {
314 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
315 } 319 }
@@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
424 local_irq_save(flags); 428 local_irq_save(flags);
425 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
426 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
427 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
428 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
429 local_irq_restore(flags); 433 local_irq_restore(flags);
430 } 434 }
@@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 442 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
439}; 445};
440 446
441cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
442 448
443static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
444 .c_vendor = "NSC", 450 .c_vendor = "NSC",
445 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
446 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
447}; 454};
448 455
449cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
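
The cyrix.c change above splits the DEVID probe into a raw __do_cyrix_devid() plus an irq-saving wrapper, so the new early_init_cyrix() can read DIR0 early and set X86_FEATURE_CYRIX_ARR for families 3 (6x86/6x86L) and 5 (6x86MX/M II). A condensed recap; merging the two cases into one fall-through is my simplification, not the patch's layout:

static void __cpuinit example_early_cyrix(struct cpuinfo_x86 *c)
{
	unsigned char dir0, dir1 = 0;

	__do_cyrix_devid(&dir0, &dir1);		/* raw probe, caller handles irq state */

	switch (dir0 >> 4) {			/* high nibble identifies the "family" */
	case 3:					/* 6x86/6x86L */
	case 5:					/* 6x86MX/M II */
		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);	/* ARRs can emulate MTRRs */
		break;
	}
}
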
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index e43ad4ad4cba..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fe9224c51d37..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
80} 160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
126 * Note that the workaround only should be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,56 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182 301
183 if (p) 302 if (p)
184 strcpy(c->x86_model_id, p); 303 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189
190 /* Work around errata */
191 Intel_errata_workarounds(c);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226 310
227 if (cpu_has_bts) 311 if (cpu_has_bts)
228 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
313
314#endif
315
316 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
229} 333}
230 334
335#ifdef CONFIG_X86_32
231static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
232{ 337{
233 /* 338 /*
@@ -240,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
240 size = 256; 345 size = 256;
241 return size; 346 return size;
242} 347}
348#endif
243 349
244static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
245 .c_vendor = "Intel", 351 .c_vendor = "Intel",
246 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
247 .c_models = { 354 .c_models = {
248 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
249 { 356 {
@@ -293,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
293 } 400 }
294 }, 401 },
295 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
296 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
297 .c_init = init_intel, 406 .c_init = init_intel,
298 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
299}; 408};
300 409
301cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
302
303#ifndef CONFIG_X86_CMPXCHG
304unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
305{
306 u8 prev;
307 unsigned long flags;
308
309 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
310 local_irq_save(flags);
311 prev = *(u8 *)ptr;
312 if (prev == old)
313 *(u8 *)ptr = new;
314 local_irq_restore(flags);
315 return prev;
316}
317EXPORT_SYMBOL(cmpxchg_386_u8);
318
319unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
320{
321 u16 prev;
322 unsigned long flags;
323
324 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
325 local_irq_save(flags);
326 prev = *(u16 *)ptr;
327 if (prev == old)
328 *(u16 *)ptr = new;
329 local_irq_restore(flags);
330 return prev;
331}
332EXPORT_SYMBOL(cmpxchg_386_u16);
333
334unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
335{
336 u32 prev;
337 unsigned long flags;
338
339 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
340 local_irq_save(flags);
341 prev = *(u32 *)ptr;
342 if (prev == old)
343 *(u32 *)ptr = new;
344 local_irq_restore(flags);
345 return prev;
346}
347EXPORT_SYMBOL(cmpxchg_386_u32);
348#endif
349
350#ifndef CONFIG_X86_CMPXCHG64
351unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
352{
353 u64 prev;
354 unsigned long flags;
355
356 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
357 local_irq_save(flags);
358 prev = *(u64 *)ptr;
359 if (prev == old)
360 *(u64 *)ptr = new;
361 local_irq_restore(flags);
362 return prev;
363}
364EXPORT_SYMBOL(cmpxchg_486_u64);
365#endif
366
367/* arch_initcall(intel_cpu_init); */
368 411
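When the XTOPOLOGY feature is absent, the hunk above falls back to the legacy path: intel_num_cpu_cores() reads CPUID leaf 4 for the core count and, on 32-bit, detect_ht() is still called. A minimal user-space sketch of the same leaf-4 calculation, assuming GCC's cpuid.h helper macro (an illustration, not kernel code):

#include <stdio.h>
#include <cpuid.h>	/* __cpuid_count() from GCC, assumed available */

/* Leaf 4, subleaf 0: EAX[4:0] is the cache type (0 = no more caches),
 * EAX[31:26] is the number of addressable core IDs per package minus 1. */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid_count(4, 0, eax, ebx, ecx, edx);
	if (eax & 0x1f)
		printf("cores per package: %u\n", (eax >> 26) + 1);
	else
		printf("cores per package: 1\n");
	return 0;
}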
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb149..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Routines to identify caches on Intel CPUs. 2 * Routines to identify caches on Intel CPUs.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -62,6 +63,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
62 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 63 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
63 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 64 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
64 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 65 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
66 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
65 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 67 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
66 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 68 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
67 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 69 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -129,9 +131,18 @@ struct _cpuid4_info {
129 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
130 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
131 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
132 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
133}; 136};
134 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
135unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
136 147
137/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -181,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
181static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
182static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
183 194
184static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
185 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
186 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
187{ 199{
188 unsigned dummy; 200 unsigned dummy;
189 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -250,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
250 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
251} 263}
252 264
253static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
254{ 275{
255 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
256 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
257 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
258 unsigned edx; 279 unsigned edx;
259 280
260 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
261 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
262 else 283 if (boot_cpu_data.x86 >= 0x10)
263 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
264 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
265 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
266 291
267 this_leaf->eax = eax; 292 this_leaf->eax = eax;
268 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
269 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
270 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
271 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
272 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
273 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
274 return 0; 299 return 0;
275} 300}
276 301
@@ -452,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
452 477
453/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
454static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
455#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
456 481
457#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
458static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -488,8 +513,8 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
488 int sibling; 513 int sibling;
489 514
490 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
491 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
492 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
493 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
494 } 519 }
495} 520}
@@ -571,7 +596,7 @@ struct _index_kobject {
571 596
572/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
573static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
574#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
575 600
576#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
577static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -636,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
636 } 661 }
637} 662}
638 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
639struct _cache_attr { 757struct _cache_attr {
640 struct attribute attr; 758 struct attribute attr;
641 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -656,6 +774,8 @@ define_one_ro(size);
656define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
657define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
658 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
659static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
660 &type.attr, 780 &type.attr,
661 &level.attr, 781 &level.attr,
@@ -666,12 +786,10 @@ static struct attribute * default_attrs[] = {
666 &size.attr, 786 &size.attr,
667 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
668 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
669 NULL 790 NULL
670}; 791};
671 792
672#define to_object(k) container_of(k, struct _index_kobject, kobj)
673#define to_attr(a) container_of(a, struct _cache_attr, attr)
674
675static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
676{ 794{
677 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -681,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
681 ret = fattr->show ? 799 ret = fattr->show ?
682 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
683 buf) : 801 buf) :
684 0; 802 0;
685 return ret; 803 return ret;
686} 804}
687 805
688static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
689 const char * buf, size_t count) 807 const char * buf, size_t count)
690{ 808{
691 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
692} 818}
693 819
694static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
@@ -779,15 +905,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
779 } 905 }
780 kobject_put(per_cpu(cache_kobject, cpu)); 906 kobject_put(per_cpu(cache_kobject, cpu));
781 cpuid4_cache_sysfs_exit(cpu); 907 cpuid4_cache_sysfs_exit(cpu);
782 break; 908 return retval;
783 } 909 }
784 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 910 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
785 } 911 }
786 if (!retval) 912 cpu_set(cpu, cache_dev_map);
787 cpu_set(cpu, cache_dev_map);
788 913
789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 914 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
790 return retval; 915 return 0;
791} 916}
792 917
793static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 918static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
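The cache_disable attribute added above accepts two hex fields, "<entry> <index>": entry selects one of the two L3 index-disable words at PCI config offset 0x1BC or 0x1C0 of the K8 northbridge, and the low bits of the second field give the cache index to disable. A hedged sketch of a user-space write; the sysfs path and the index value are illustrative only, not a recommendation:

#include <stdio.h>

int main(void)
{
	/* Hypothetical path: index3 is the L3 leaf on family 0x10 parts. */
	const char *path =
		"/sys/devices/system/cpu/cpu0/cache/index3/cache_disable";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	/* "<entry> <index>": program disable entry 0 with cache index 0x200 */
	fprintf(f, "0 0x200\n");
	fclose(f);
	return 0;
}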
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b764..f390c9f66351 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/smp.h> 10#include <linux/smp.h>
11 11
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For AMD Athlon/Duron */ 18/* Machine Check Handler For AMD Athlon/Duron */
19static void k7_machine_check(struct pt_regs * regs, long error_code) 19static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover = 1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 23 u32 mcgstl, mcgsth;
24 int i; 24 int i;
25 25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover = 0;
29 29
30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
60 } 60 }
61 61
62 if (recover&2) 62 if (recover&2)
63 panic ("CPU context corrupt"); 63 panic("CPU context corrupt");
64 if (recover&1) 64 if (recover&1)
65 panic ("Unable to continue"); 65 panic("Unable to continue");
66 printk (KERN_EMERG "Attempting to continue.\n"); 66 printk(KERN_EMERG "Attempting to continue.\n");
67 mcgstl &= ~(1<<2); 67 mcgstl &= ~(1<<2);
68 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); 68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 69}
70 70
71 71
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
81 machine_check_vector = k7_machine_check; 81 machine_check_vector = k7_machine_check;
82 wmb(); 82 wmb();
83 83
84 printk (KERN_INFO "Intel machine check architecture supported.\n"); 84 printk(KERN_INFO "Intel machine check architecture supported.\n");
85 rdmsr (MSR_IA32_MCG_CAP, l, h); 85 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 86 if (l & (1<<8)) /* Control register present ? */
87 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 88 nr_mce_banks = l & 0xff;
89 89
90 /* Clear status for MC index 0 separately, we don't touch CTL, 90 /* Clear status for MC index 0 separately, we don't touch CTL,
91 * as some K7 Athlons cause spurious MCEs when it's enabled. */ 91 * as some K7 Athlons cause spurious MCEs when it's enabled. */
92 if (boot_cpu_data.x86 == 6) { 92 if (boot_cpu_data.x86 == 6) {
93 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); 93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 94 i = 1;
95 } else 95 } else
96 i = 0; 96 i = 0;
97 for (; i<nr_mce_banks; i++) { 97 for (; i < nr_mce_banks; i++) {
98 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
100 } 100 }
101 101
102 set_in_cr4 (X86_CR4_MCE); 102 set_in_cr4(X86_CR4_MCE);
103 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", 103 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
104 smp_processor_id()); 104 smp_processor_id());
105} 105}
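The K7 handler above (and the P4 handler further down) walks the machine-check banks with a stride of four MSRs per bank, which is why the control and status registers are addressed as MSR_IA32_MC0_CTL+4*i and MSR_IA32_MC0_STATUS+4*i. A small standalone sketch of that addressing; 0x400 is the architectural value of MSR_IA32_MC0_CTL, and no MSR is actually accessed here:

#include <stdio.h>

#define MSR_IA32_MC0_CTL	0x400	/* bank 0 control MSR */

int main(void)
{
	int i;

	/* Each bank owns four consecutive MSRs: CTL, STATUS, ADDR, MISC. */
	for (i = 0; i < 6; i++)
		printf("bank %d: CTL %#x STATUS %#x\n", i,
		       MSR_IA32_MC0_CTL + 4 * i,
		       MSR_IA32_MC0_CTL + 4 * i + 1);
	return 0;
}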
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -9,6 +9,7 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
14#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
@@ -31,7 +32,7 @@
31#include <asm/idle.h> 32#include <asm/idle.h>
32 33
33#define MISC_MCELOG_MINOR 227 34#define MISC_MCELOG_MINOR 227
34#define NR_BANKS 6 35#define NR_SYSFS_BANKS 6
35 36
36atomic_t mce_entry; 37atomic_t mce_entry;
37 38
@@ -46,7 +47,7 @@ static int mce_dont_init;
46 */ 47 */
47static int tolerant = 1; 48static int tolerant = 1;
48static int banks; 49static int banks;
49static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; 50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
50static unsigned long notify_user; 51static unsigned long notify_user;
51static int rip_msr; 52static int rip_msr;
52static int mce_bootlog = -1; 53static int mce_bootlog = -1;
@@ -209,7 +210,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
209 barrier(); 210 barrier();
210 211
211 for (i = 0; i < banks; i++) { 212 for (i = 0; i < banks; i++) {
212 if (!bank[i]) 213 if (i < NR_SYSFS_BANKS && !bank[i])
213 continue; 214 continue;
214 215
215 m.misc = 0; 216 m.misc = 0;
@@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info)
363 364
364static void mcheck_timer(struct work_struct *work) 365static void mcheck_timer(struct work_struct *work)
365{ 366{
366 on_each_cpu(mcheck_check_cpu, NULL, 1, 1); 367 on_each_cpu(mcheck_check_cpu, NULL, 1);
367 368
368 /* 369 /*
369 * Alert userspace if needed. If we logged an MCE, reduce the 370 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -444,9 +445,10 @@ static void mce_init(void *dummy)
444 445
445 rdmsrl(MSR_IA32_MCG_CAP, cap); 446 rdmsrl(MSR_IA32_MCG_CAP, cap);
446 banks = cap & 0xff; 447 banks = cap & 0xff;
447 if (banks > NR_BANKS) { 448 if (banks > MCE_EXTENDED_BANK) {
448 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); 449 banks = MCE_EXTENDED_BANK;
449 banks = NR_BANKS; 450 printk(KERN_INFO "MCE: warning: using only %d banks\n",
451 MCE_EXTENDED_BANK);
450 } 452 }
451 /* Use accurate RIP reporting if available. */ 453 /* Use accurate RIP reporting if available. */
452 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +464,11 @@ static void mce_init(void *dummy)
462 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
463 465
464 for (i = 0; i < banks; i++) { 466 for (i = 0; i < banks; i++) {
465 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 467 if (i < NR_SYSFS_BANKS)
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
466 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
467 } 473 }
468} 474}
@@ -527,10 +533,12 @@ static int open_exclu; /* already open exclusive? */
527 533
528static int mce_open(struct inode *inode, struct file *file) 534static int mce_open(struct inode *inode, struct file *file)
529{ 535{
536 lock_kernel();
530 spin_lock(&mce_state_lock); 537 spin_lock(&mce_state_lock);
531 538
532 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 539 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
533 spin_unlock(&mce_state_lock); 540 spin_unlock(&mce_state_lock);
541 unlock_kernel();
534 return -EBUSY; 542 return -EBUSY;
535 } 543 }
536 544
@@ -539,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file)
539 open_count++; 547 open_count++;
540 548
541 spin_unlock(&mce_state_lock); 549 spin_unlock(&mce_state_lock);
550 unlock_kernel();
542 551
543 return nonseekable_open(inode, file); 552 return nonseekable_open(inode, file);
544} 553}
@@ -571,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
571 char __user *buf = ubuf; 580 char __user *buf = ubuf;
572 int i, err; 581 int i, err;
573 582
574 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
575 if (!cpu_tsc) 584 if (!cpu_tsc)
576 return -ENOMEM; 585 return -ENOMEM;
577 586
@@ -612,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
612 * Collect entries that were still getting written before the 621 * Collect entries that were still getting written before the
613 * synchronize. 622 * synchronize.
614 */ 623 */
615 on_each_cpu(collect_tscs, cpu_tsc, 1, 1); 624 on_each_cpu(collect_tscs, cpu_tsc, 1);
616 for (i = next; i < MCE_LOG_LEN; i++) { 625 for (i = next; i < MCE_LOG_LEN; i++) {
617 if (mcelog.entry[i].finished && 626 if (mcelog.entry[i].finished &&
618 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 627 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
@@ -737,7 +746,7 @@ static void mce_restart(void)
737 if (next_interval) 746 if (next_interval)
738 cancel_delayed_work(&mcheck_work); 747 cancel_delayed_work(&mcheck_work);
739 /* Timer race is harmless here */ 748 /* Timer race is harmless here */
740 on_each_cpu(mce_init, NULL, 1, 1); 749 on_each_cpu(mce_init, NULL, 1);
741 next_interval = check_interval * HZ; 750 next_interval = check_interval * HZ;
742 if (next_interval) 751 if (next_interval)
743 schedule_delayed_work(&mcheck_work, 752 schedule_delayed_work(&mcheck_work,
@@ -750,13 +759,18 @@ static struct sysdev_class mce_sysclass = {
750}; 759};
751 760
752DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
753 763
754/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
755#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
756 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 766 static ssize_t show_ ## name(struct sys_device *s, \
767 struct sysdev_attribute *attr, \
768 char *buf) { \
757 return sprintf(buf, "%lx\n", (unsigned long)var); \ 769 return sprintf(buf, "%lx\n", (unsigned long)var); \
758 } \ 770 } \
759 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 771 static ssize_t set_ ## name(struct sys_device *s, \
772 struct sysdev_attribute *attr, \
773 const char *buf, size_t siz) { \
760 char *end; \ 774 char *end; \
761 unsigned long new = simple_strtoul(buf, &end, 0); \ 775 unsigned long new = simple_strtoul(buf, &end, 0); \
762 if (end == buf) return -EINVAL; \ 776 if (end == buf) return -EINVAL; \
@@ -766,7 +780,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
766 } \ 780 } \
767 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 781 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
768 782
769/* TBD should generate these dynamically based on number of available banks */ 783/*
784 * TBD should generate these dynamically based on number of available banks.
 785 * Have only 6 control banks in /sysfs until then.
786 */
770ACCESSOR(bank0ctl,bank[0],mce_restart()) 787ACCESSOR(bank0ctl,bank[0],mce_restart())
771ACCESSOR(bank1ctl,bank[1],mce_restart()) 788ACCESSOR(bank1ctl,bank[1],mce_restart())
772ACCESSOR(bank2ctl,bank[2],mce_restart()) 789ACCESSOR(bank2ctl,bank[2],mce_restart())
@@ -774,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
774ACCESSOR(bank4ctl,bank[4],mce_restart()) 791ACCESSOR(bank4ctl,bank[4],mce_restart())
775ACCESSOR(bank5ctl,bank[5],mce_restart()) 792ACCESSOR(bank5ctl,bank[5],mce_restart())
776 793
777static ssize_t show_trigger(struct sys_device *s, char *buf) 794static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
795 char *buf)
778{ 796{
779 strcpy(buf, trigger); 797 strcpy(buf, trigger);
780 strcat(buf, "\n"); 798 strcat(buf, "\n");
781 return strlen(trigger) + 1; 799 return strlen(trigger) + 1;
782} 800}
783 801
784static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 802static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
803 const char *buf,size_t siz)
785{ 804{
786 char *p; 805 char *p;
787 int len; 806 int len;
@@ -794,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
794} 813}
795 814
796static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 815static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
797ACCESSOR(tolerant,tolerant,) 816static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
798ACCESSOR(check_interval,check_interval,mce_restart()) 817ACCESSOR(check_interval,check_interval,mce_restart())
799static struct sysdev_attribute *mce_attributes[] = { 818static struct sysdev_attribute *mce_attributes[] = {
800 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 819 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
801 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 820 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
802 &attr_tolerant, &attr_check_interval, &attr_trigger, 821 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
803 NULL 822 NULL
804}; 823};
805 824
@@ -841,7 +860,7 @@ error:
841 return err; 860 return err;
842} 861}
843 862
844static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
845{ 864{
846 int i; 865 int i;
847 866
@@ -865,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
865 case CPU_ONLINE: 884 case CPU_ONLINE:
866 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
867 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
868 break; 889 break;
869 case CPU_DEAD: 890 case CPU_DEAD:
870 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
871 mce_remove_device(cpu); 894 mce_remove_device(cpu);
872 break; 895 break;
873 } 896 }
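The threshold_cpu_callback pointer introduced above lets the AMD threshold code hook CPU hotplug through the generic MCE handler instead of registering its own notifier: mce_cpu_callback simply calls through the pointer when it is non-NULL. A stripped-down sketch of that hook pattern, using illustrative names rather than the kernel symbols:

#include <stdio.h>

/* One function pointer, set once at driver init, called from the
 * generic hotplug path only if a handler was installed. */
static void (*cpu_hook)(unsigned long action, unsigned int cpu);

static void threshold_hook(unsigned long action, unsigned int cpu)
{
	printf("threshold: action %lu on cpu %u\n", action, cpu);
}

static void generic_cpu_event(unsigned long action, unsigned int cpu)
{
	if (cpu_hook)
		cpu_hook(action, cpu);
}

int main(void)
{
	cpu_hook = threshold_hook;	/* done once by the driver */
	generic_cpu_event(1, 0);	/* generic code calls through */
	return 0;
}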
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 00ccb6c14ec2..cc1fccdd31e0 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
59 59
60static void mce_work_fn(struct work_struct *work) 60static void mce_work_fn(struct work_struct *work)
61{ 61{
62 on_each_cpu(mce_checkregs, NULL, 1, 1); 62 on_each_cpu(mce_checkregs, NULL, 1);
63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
64} 64}
65 65
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a5..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
32 /* u32 *reserved[]; */ 32 /* u32 *reserved[]; */
33}; 33};
34 34
35static int mce_num_extended_msrs = 0; 35static int mce_num_extended_msrs;
36 36
37 37
38#ifdef CONFIG_X86_MCE_P4THERMAL 38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id()); 42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
83 * be some SMM goo which handles it, so we can't even put a handler 83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem. 84 * since it might be delivered via SMI already -zwanem.
85 */ 85 */
86 rdmsr (MSR_IA32_MISC_ENABLE, l, h); 86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR); 87 h = apic_read(APIC_LVTTHMR);
88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) { 88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", 89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
91 return; /* -EBUSY */ 91 return; /* -EBUSY */
92 } 92 }
93 93
94 /* check whether a vector already exists, temporarily masked? */ 94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) { 95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " 96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n", 97 "installed\n",
@@ -102,20 +102,20 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109 109
110 /* ok we're good to go... */ 110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 111 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 112
116 l = apic_read (APIC_LVTTHMR); 113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
117 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
118 printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1); 121 atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 129{
130 u32 h; 130 u32 h;
131 131
132 rdmsr (MSR_IA32_MCG_EAX, r->eax, h); 132 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
133 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); 133 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
134 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); 134 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
135 rdmsr (MSR_IA32_MCG_EDX, r->edx, h); 135 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
136 rdmsr (MSR_IA32_MCG_ESI, r->esi, h); 136 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
137 rdmsr (MSR_IA32_MCG_EDI, r->edi, h); 137 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
138 rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); 138 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
139 rdmsr (MSR_IA32_MCG_ESP, r->esp, h); 139 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
140 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); 140 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 141 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
142} 142}
143 143
144static void intel_machine_check(struct pt_regs * regs, long error_code) 144static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 145{
146 int recover=1; 146 int recover = 1;
147 u32 alow, ahigh, high, low; 147 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 148 u32 mcgstl, mcgsth;
149 int i; 149 int i;
150 150
151 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
152 if (mcgstl & (1<<0)) /* Recoverable ? */ 152 if (mcgstl & (1<<0)) /* Recoverable ? */
153 recover=0; 153 recover = 0;
154 154
155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
156 smp_processor_id(), mcgsth, mcgstl); 156 smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
191 } 191 }
192 192
193 if (recover & 2) 193 if (recover & 2)
194 panic ("CPU context corrupt"); 194 panic("CPU context corrupt");
195 if (recover & 1) 195 if (recover & 1)
196 panic ("Unable to continue"); 196 panic("Unable to continue");
197 197
198 printk(KERN_EMERG "Attempting to continue.\n"); 198 printk(KERN_EMERG "Attempting to continue.\n");
199 /* 199 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 201 * recoverable/continuable. This will allow BIOS to look at the MSRs 201 * recoverable/continuable. This will allow BIOS to look at the MSRs
202 * for errors if the OS could not log the error. 202 * for errors if the OS could not log the error.
203 */ 203 */
204 for (i=0; i<nr_mce_banks; i++) { 204 for (i = 0; i < nr_mce_banks; i++) {
205 u32 msr; 205 u32 msr;
206 msr = MSR_IA32_MC0_STATUS+i*4; 206 msr = MSR_IA32_MC0_STATUS+i*4;
207 rdmsr (msr, low, high); 207 rdmsr(msr, low, high);
208 if (high&(1<<31)) { 208 if (high&(1<<31)) {
209 /* Clear it */ 209 /* Clear it */
210 wrmsr(msr, 0UL, 0UL); 210 wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
214 } 214 }
215 } 215 }
216 mcgstl &= ~(1<<2); 216 mcgstl &= ~(1<<2);
217 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); 217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 218}
219 219
220 220
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 222{
223 u32 l, h; 223 u32 l, h;
224 int i; 224 int i;
225 225
226 machine_check_vector = intel_machine_check; 226 machine_check_vector = intel_machine_check;
227 wmb(); 227 wmb();
228 228
229 printk (KERN_INFO "Intel machine check architecture supported.\n"); 229 printk(KERN_INFO "Intel machine check architecture supported.\n");
230 rdmsr (MSR_IA32_MCG_CAP, l, h); 230 rdmsr(MSR_IA32_MCG_CAP, l, h);
231 if (l & (1<<8)) /* Control register present ? */ 231 if (l & (1<<8)) /* Control register present ? */
232 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 232 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
233 nr_mce_banks = l & 0xff; 233 nr_mce_banks = l & 0xff;
234 234
235 for (i=0; i<nr_mce_banks; i++) { 235 for (i = 0; i < nr_mce_banks; i++) {
236 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 236 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
237 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 237 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
238 } 238 }
239 239
240 set_in_cr4 (X86_CR4_MCE); 240 set_in_cr4(X86_CR4_MCE);
241 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", 241 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
242 smp_processor_id()); 242 smp_processor_id());
243 243
244 /* Check for P4/Xeon extended MCE MSRs */ 244 /* Check for P4/Xeon extended MCE MSRs */
245 rdmsr (MSR_IA32_MCG_CAP, l, h); 245 rdmsr(MSR_IA32_MCG_CAP, l, h);
246 if (l & (1<<9)) {/* MCG_EXT_P */ 246 if (l & (1<<9)) {/* MCG_EXT_P */
247 mce_num_extended_msrs = (l >> 16) & 0xff; 247 mce_num_extended_msrs = (l >> 16) & 0xff;
248 printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" 248 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
249 " available\n", 249 " available\n",
250 smp_processor_id(), mce_num_extended_msrs); 250 smp_processor_id(), mce_num_extended_msrs);
251 251
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
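For a cpufeature.h line such as "#define X86_FEATURE_FPU ... /* Onboard FPU */" with no quoted override, the script lowercases the macro suffix and emits one designated initializer per feature, so the generated file looks roughly like the fragment below (the exact contents depend on the header; this is only representative):

#include <asm/cpufeature.h>

const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU]		= "fpu",
	[X86_FEATURE_VME]		= "vme",
	/* ... one entry per X86_FEATURE_* macro ... */
};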
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
37static unsigned long smp_changes_mask; 37static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {}; 38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 39static int mtrr_state_set;
40static u64 tom2; 40u64 mtrr_tom2;
41 41
42#undef MODULE_PARAM_PREFIX 42#undef MODULE_PARAM_PREFIX
43#define MODULE_PARAM_PREFIX "mtrr." 43#define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
139 } 139 }
140 } 140 }
141 141
142 if (tom2) { 142 if (mtrr_tom2) {
143 if (start >= (1ULL<<32) && (end < tom2)) 143 if (start >= (1ULL<<32) && (end < mtrr_tom2))
144 return MTRR_TYPE_WRBACK; 144 return MTRR_TYPE_WRBACK;
145 } 145 }
146 146
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
158 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 158 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
159} 159}
160 160
161/* fill the MSR pair relating to a var range */
162void fill_mtrr_var_range(unsigned int index,
163 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
164{
165 struct mtrr_var_range *vr;
166
167 vr = mtrr_state.var_ranges;
168
169 vr[index].base_lo = base_lo;
170 vr[index].base_hi = base_hi;
171 vr[index].mask_lo = mask_lo;
172 vr[index].mask_hi = mask_hi;
173}
174
161static void 175static void
162get_fixed_ranges(mtrr_type * frs) 176get_fixed_ranges(mtrr_type * frs)
163{ 177{
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
213 mtrr_state.enabled = (lo & 0xc00) >> 10; 227 mtrr_state.enabled = (lo & 0xc00) >> 10;
214 228
215 if (amd_special_default_mtrr()) { 229 if (amd_special_default_mtrr()) {
216 unsigned lo, hi; 230 unsigned low, high;
217 /* TOP_MEM2 */ 231 /* TOP_MEM2 */
218 rdmsr(MSR_K8_TOP_MEM2, lo, hi); 232 rdmsr(MSR_K8_TOP_MEM2, low, high);
219 tom2 = hi; 233 mtrr_tom2 = high;
220 tom2 <<= 32; 234 mtrr_tom2 <<= 32;
221 tom2 |= lo; 235 mtrr_tom2 |= low;
222 tom2 &= 0xffffff8000000ULL; 236 mtrr_tom2 &= 0xffffff800000ULL;
223 } 237 }
224 if (mtrr_show) { 238 if (mtrr_show) {
225 int high_width; 239 int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
251 else 265 else
252 printk(KERN_INFO "MTRR %u disabled\n", i); 266 printk(KERN_INFO "MTRR %u disabled\n", i);
253 } 267 }
254 if (tom2) { 268 if (mtrr_tom2) {
255 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", 269 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
256 tom2, tom2>>20); 270 mtrr_tom2, mtrr_tom2>>20);
257 } 271 }
258 } 272 }
259 mtrr_state_set = 1; 273 mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
328 342
329 if (lo != msrwords[0] || hi != msrwords[1]) { 343 if (lo != msrwords[0] || hi != msrwords[1]) {
330 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 344 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
331 boot_cpu_data.x86 == 15 && 345 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
332 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) 346 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
333 k8_enable_fixed_iorrs(); 347 k8_enable_fixed_iorrs();
334 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 348 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
@@ -365,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
365 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
366{ 380{
367 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
368 383
369 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
370 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -378,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
378 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
379 394
380 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
381 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
382 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
 398 /* Expand the high bits of tmp to all 1s */
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405 mask_lo = tmp;
406 }
407 }
383 408
384 /* This works correctly if size is a power of two, i.e. a 409 /* This works correctly if size is a power of two, i.e. a
385 contiguous range. */ 410 contiguous range. */
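The fix-up added to generic_get_mtrr() assumes the variable-range mask is contiguous: it locates the highest set bit of the shifted mask, extends it upward to all ones, and warns if the result differs from what the BIOS programmed. A standalone sketch of just that bit manipulation, using a hypothetical helper that mirrors the arithmetic rather than the kernel interface:

#include <stdio.h>

/* Index (1-based) of the highest set bit, 0 if none; stands in for fls(). */
static int fls_u32(unsigned int x)
{
	int i = 0;

	while (x) {
		i++;
		x >>= 1;
	}
	return i;
}

/* Extend the mask upward from its highest set bit, as the fixed-up
 * generic_get_mtrr() does before comparing against the BIOS value. */
static unsigned int extend_mask(unsigned int tmp)
{
	int hi = fls_u32(tmp);

	if (hi > 0)
		tmp |= ~((1u << (hi - 1)) - 1);
	return tmp;
}

int main(void)
{
	printf("%08x\n", extend_mask(0x00f00000));	/* prints fff00000 */
	return 0;
}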
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/cpu.h> 38#include <linux/cpu.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/sort.h>
40 41
41#include <asm/e820.h> 42#include <asm/e820.h>
42#include <asm/mtrr.h> 43#include <asm/mtrr.h>
@@ -222,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
222 atomic_set(&data.gate,0); 223 atomic_set(&data.gate,0);
223 224
224 /* Start the ball rolling on other CPUs */ 225 /* Start the ball rolling on other CPUs */
225 if (smp_call_function(ipi_handler, &data, 1, 0) != 0) 226 if (smp_call_function(ipi_handler, &data, 0) != 0)
226 panic("mtrr: timed out waiting for other CPUs\n"); 227 panic("mtrr: timed out waiting for other CPUs\n");
227 228
228 local_irq_save(flags); 229 local_irq_save(flags);
@@ -609,6 +610,891 @@ static struct sysdev_driver mtrr_sysdev_driver = {
609 .resume = mtrr_restore, 610 .resume = mtrr_restore,
610}; 611};
611 612
 613/* should be related to the number of MTRR variable ranges */
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
 664 /* no overlap with an existing range, add a new one */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "run of slot in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
 798 /* clear those that are not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806
807#ifdef CONFIG_MTRR_SANITIZER
808
809static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
810{
811 unsigned long sum;
812 int i;
813
814 sum = 0;
815 for (i = 0; i < nr_range; i++)
816 sum += range[i].end + 1 - range[i].start;
817
818 return sum;
819}
820
821static int enable_mtrr_cleanup __initdata =
822 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
823
824static int __init disable_mtrr_cleanup_setup(char *str)
825{
826 if (enable_mtrr_cleanup != -1)
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 if (enable_mtrr_cleanup != -1)
835 enable_mtrr_cleanup = 1;
836 return 0;
837}
838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
846
847struct var_mtrr_state {
848 unsigned long range_startk;
849 unsigned long range_sizek;
850 unsigned long chunk_sizek;
851 unsigned long gran_sizek;
852 unsigned int reg;
853};
854
855static void __init
856set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
857 unsigned char type, unsigned int address_bits)
858{
859 u32 base_lo, base_hi, mask_lo, mask_hi;
860 u64 base, mask;
861
862 if (!sizek) {
863 fill_mtrr_var_range(reg, 0, 0, 0, 0);
864 return;
865 }
866
867 mask = (1ULL << address_bits) - 1;
868 mask &= ~((((u64)sizek) << 10) - 1);
869
870 base = ((u64)basek) << 10;
871
872 base |= type;
873 mask |= 0x800;
874
875 base_lo = base & ((1ULL<<32) - 1);
876 base_hi = base >> 32;
877
878 mask_lo = mask & ((1ULL<<32) - 1);
879 mask_hi = mask >> 32;
880
881 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
882}
883
884static void __init
885save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
886 unsigned char type)
887{
888 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
889 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
890 range_state[reg].type = type;
891}
892
893static void __init
894set_var_mtrr_all(unsigned int address_bits)
895{
896 unsigned long basek, sizek;
897 unsigned char type;
898 unsigned int reg;
899
900 for (reg = 0; reg < num_var_ranges; reg++) {
901 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
902 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
903 type = range_state[reg].type;
904
905 set_var_mtrr(reg, basek, sizek, type, address_bits);
906 }
907}
908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
930static unsigned int __init
931range_to_mtrr(unsigned int reg, unsigned long range_startk,
932 unsigned long range_sizek, unsigned char type)
933{
934 if (!range_sizek || (reg >= num_var_ranges))
935 return reg;
936
937 while (range_sizek) {
938 unsigned long max_align, align;
939 unsigned long sizek;
940
941 /* Compute the maximum size I can make a range */
942 if (range_startk)
943 max_align = ffs(range_startk) - 1;
944 else
945 max_align = 32;
946 align = fls(range_sizek) - 1;
947 if (align > max_align)
948 align = max_align;
949
950 sizek = 1 << align;
951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
958 printk(KERN_DEBUG "Setting variable MTRR %d, "
959 "base: %ld%cB, range: %ld%cB, type %s\n",
960 reg, start_base, start_factor,
961 size_base, size_factor,
962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
964 );
965 }
966 save_var_mtrr(reg++, range_startk, sizek, type);
967 range_startk += sizek;
968 range_sizek -= sizek;
969 if (reg >= num_var_ranges)
970 break;
971 }
972 return reg;
973}
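
The loop in range_to_mtrr() greedily carves a range into naturally aligned power-of-two blocks, limited both by the alignment of the current start (ffs) and by the largest power of two that fits the remaining size (fls). A standalone sketch of just that splitting step, with illustrative numbers:

#include <stdio.h>

/* lowest set bit position, 1-based (0 for input 0), like ffs() */
static int my_ffs(unsigned long x)
{
	int n;
	if (!x)
		return 0;
	for (n = 1; !(x & 1); n++)
		x >>= 1;
	return n;
}

/* highest set bit position, 1-based (0 for input 0), like fls() */
static int my_fls(unsigned long x)
{
	int n = 0;
	while (x) {
		n++;
		x >>= 1;
	}
	return n;
}

int main(void)
{
	unsigned long startk = 3 * 1024;	/* 3MB, in KB */
	unsigned long sizek = 5 * 1024;		/* 5MB, in KB */

	while (sizek) {
		unsigned long max_align, align, chunk;

		max_align = startk ? my_ffs(startk) - 1 : 32;
		align = my_fls(sizek) - 1;
		if (align > max_align)
			align = max_align;

		chunk = 1UL << align;
		printf("MTRR: base %luK size %luK\n", startk, chunk);
		startk += chunk;
		sizek -= chunk;
	}
	/* prints 1M at 3M, then 4M at 4M: two registers for a 5MB range */
	return 0;
}
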
974
975static unsigned __init
976range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
977 unsigned long sizek)
978{
979 unsigned long hole_basek, hole_sizek;
980 unsigned long second_basek, second_sizek;
981 unsigned long range0_basek, range0_sizek;
982 unsigned long range_basek, range_sizek;
983 unsigned long chunk_sizek;
984 unsigned long gran_sizek;
985
986 hole_basek = 0;
987 hole_sizek = 0;
988 second_basek = 0;
989 second_sizek = 0;
990 chunk_sizek = state->chunk_sizek;
991 gran_sizek = state->gran_sizek;
992
993	/* align to gran size, to prevent small blocks from using up MTRRs */
994 range_basek = ALIGN(state->range_startk, gran_sizek);
995 if ((range_basek > basek) && basek)
996 return second_sizek;
997 state->range_sizek -= (range_basek - state->range_startk);
998 range_sizek = ALIGN(state->range_sizek, gran_sizek);
999
1000 while (range_sizek > state->range_sizek) {
1001 range_sizek -= gran_sizek;
1002 if (!range_sizek)
1003 return 0;
1004 }
1005 state->range_sizek = range_sizek;
1006
1007 /* try to append some small hole */
1008 range0_basek = state->range_startk;
1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
1012 if (range0_sizek == state->range_sizek) {
1013 if (debug_print)
1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1015 range0_basek<<10,
1016 (range0_basek + state->range_sizek)<<10);
1017 state->reg = range_to_mtrr(state->reg, range0_basek,
1018 state->range_sizek, MTRR_TYPE_WRBACK);
1019 return 0;
1020 }
1021
1022	/* only cut back when it is not the last range */
1023 if (sizek) {
1024 while (range0_basek + range0_sizek > (basek + sizek)) {
1025 if (range0_sizek >= chunk_sizek)
1026 range0_sizek -= chunk_sizek;
1027 else
1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
1056 }
1057
1058 if (range0_sizek) {
1059 if (debug_print)
1060 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1061 range0_basek<<10,
1062 (range0_basek + range0_sizek)<<10);
1063 state->reg = range_to_mtrr(state->reg, range0_basek,
1064 range0_sizek, MTRR_TYPE_WRBACK);
1065 }
1066
1067 if (range0_sizek < state->range_sizek) {
1068 /* need to handle left over */
1069 range_sizek = state->range_sizek - range0_sizek;
1070
1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1077 }
1078
1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1081 if (debug_print)
1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1083 hole_basek<<10,
1084 (hole_basek + hole_sizek)<<10);
1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1087 }
1088
1089 return second_sizek;
1090}
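
The point of the hole handling above is that one larger write-back chunk plus an uncacheable hole can need fewer registers than an exact power-of-two decomposition. A rough back-of-the-envelope sketch of that trade-off (illustrative only, not the kernel's exact accounting):

#include <stdio.h>

/* number of set bits == number of naturally aligned power-of-two blocks
 * needed to cover 'size' exactly (assuming a suitably aligned base)
 */
static int popcount(unsigned long x)
{
	int n = 0;
	for (; x; x &= x - 1)
		n++;
	return n;
}

int main(void)
{
	unsigned long size_mb = 1792;	/* e.g. 1.75GB of RAM */
	unsigned long chunk_mb = 2048;	/* round up to a 2GB chunk */
	unsigned long hole_mb = chunk_mb - size_mb;

	printf("exact WB cover:     %d registers\n", popcount(size_mb));
	printf("WB chunk + UC hole: %d registers\n", 1 + popcount(hole_mb));
	/* 1792MB = 1G+512M+256M needs 3 WB registers, while one 2GB WB
	 * register plus one 256MB UC hole needs only 2.
	 */
	return 0;
}
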
1091
1092static void __init
1093set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1094 unsigned long size_pfn)
1095{
1096 unsigned long basek, sizek;
1097 unsigned long second_sizek = 0;
1098
1099 if (state->reg >= num_var_ranges)
1100 return;
1101
1102 basek = base_pfn << (PAGE_SHIFT - 10);
1103 sizek = size_pfn << (PAGE_SHIFT - 10);
1104
1105 /* See if I can merge with the last range */
1106 if ((basek <= 1024) ||
1107 (state->range_startk + state->range_sizek == basek)) {
1108 unsigned long endk = basek + sizek;
1109 state->range_sizek = endk - state->range_startk;
1110 return;
1111 }
1112 /* Write the range mtrrs */
1113 if (state->range_sizek != 0)
1114 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1115
1116 /* Allocate an msr */
1117 state->range_startk = basek + second_sizek;
1118 state->range_sizek = sizek - second_sizek;
1119}
1120
1121/* minimum size of an mtrr block that can take a hole */
1122static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1123
1124static int __init parse_mtrr_chunk_size_opt(char *p)
1125{
1126 if (!p)
1127 return -EINVAL;
1128 mtrr_chunk_size = memparse(p, &p);
1129 return 0;
1130}
1131early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1132
1133/* granularity of an mtrr block */
1134static u64 mtrr_gran_size __initdata;
1135
1136static int __init parse_mtrr_gran_size_opt(char *p)
1137{
1138 if (!p)
1139 return -EINVAL;
1140 mtrr_gran_size = memparse(p, &p);
1141 return 0;
1142}
1143early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
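
Both mtrr_chunk_size= and mtrr_gran_size= are parsed by memparse(), which accepts a number with an optional K/M/G suffix. A simplified userspace approximation of that parsing, for illustration only:

#include <stdio.h>
#include <stdlib.h>

/* roughly what memparse() does for the suffixes used here */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long val = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10; /* fall through */
	case 'M': case 'm': val <<= 10; /* fall through */
	case 'K': case 'k': val <<= 10; break;
	default: break;
	}
	return val;
}

int main(void)
{
	/* e.g. booting with mtrr_gran_size=64M mtrr_chunk_size=1G */
	printf("gran:  %llu bytes\n", parse_size("64M"));
	printf("chunk: %llu bytes\n", parse_size("1G"));
	return 0;
}
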
1144
1145static int nr_mtrr_spare_reg __initdata =
1146 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1147
1148static int __init parse_mtrr_spare_reg(char *arg)
1149{
1150 if (arg)
1151 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1152 return 0;
1153}
1154
1155early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1156
1157static int __init
1158x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1159 u64 chunk_size, u64 gran_size)
1160{
1161 struct var_mtrr_state var_state;
1162 int i;
1163 int num_reg;
1164
1165 var_state.range_startk = 0;
1166 var_state.range_sizek = 0;
1167 var_state.reg = 0;
1168 var_state.chunk_sizek = chunk_size >> 10;
1169 var_state.gran_sizek = gran_size >> 10;
1170
1171 memset(range_state, 0, sizeof(range_state));
1172
1173 /* Write the range etc */
1174 for (i = 0; i < nr_range; i++)
1175 set_var_mtrr_range(&var_state, range[i].start,
1176 range[i].end - range[i].start + 1);
1177
1178 /* Write the last range */
1179 if (var_state.range_sizek != 0)
1180 range_to_mtrr_with_hole(&var_state, 0, 0);
1181
1182 num_reg = var_state.reg;
1183	/* Clear out the extra MTRRs */
1184 while (var_state.reg < num_var_ranges) {
1185 save_var_mtrr(var_state.reg, 0, 0, 0);
1186 var_state.reg++;
1187 }
1188
1189 return num_reg;
1190}
1191
1192struct mtrr_cleanup_result {
1193 unsigned long gran_sizek;
1194 unsigned long chunk_sizek;
1195 unsigned long lose_cover_sizek;
1196 unsigned int num_reg;
1197 int bad;
1198};
1199
1200/*
1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1202 * chunk size: gran_size, ..., 2G
1203 * so we need at most (1+16)*8 = 136 result slots
1204 */
1205#define NUM_RESULT 136
1206#define PSHIFT (PAGE_SHIFT - 10)
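
The 136 above can be checked by mirroring the two nested loops the search below actually runs (gran from 64K up to 2G, chunk from the current gran up to 2G):

#include <stdio.h>

int main(void)
{
	unsigned long long gran, chunk;
	int n = 0;

	for (gran = 1ULL << 16; gran < (1ULL << 32); gran <<= 1)
		for (chunk = gran; chunk < (1ULL << 32); chunk <<= 1)
			n++;

	printf("%d\n", n);	/* 136 == (1+16)*8, matching NUM_RESULT */
	return 0;
}
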
1207
1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1209static struct res_range __initdata range_new[RANGE_NUM];
1210static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1211
1212static int __init mtrr_cleanup(unsigned address_bits)
1213{
1214 unsigned long extra_remove_base, extra_remove_size;
1215 unsigned long base, size, def, dummy;
1216 mtrr_type type;
1217 int nr_range, nr_range_new;
1218 u64 chunk_size, gran_size;
1219 unsigned long range_sums, range_sums_new;
1220 int index_good;
1221 int num_reg_good;
1222 int i;
1223
1224	/* extra slot counts the empty (size 0) entries */
1225 int num[MTRR_NUM_TYPES + 1];
1226
1227 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1228 return 0;
1229 rdmsr(MTRRdefType_MSR, def, dummy);
1230 def &= 0xff;
1231 if (def != MTRR_TYPE_UNCACHABLE)
1232 return 0;
1233
1234 /* get it and store it aside */
1235 memset(range_state, 0, sizeof(range_state));
1236 for (i = 0; i < num_var_ranges; i++) {
1237 mtrr_if->get(i, &base, &size, &type);
1238 range_state[i].base_pfn = base;
1239 range_state[i].size_pfn = size;
1240 range_state[i].type = type;
1241 }
1242
1243 /* check entries number */
1244 memset(num, 0, sizeof(num));
1245 for (i = 0; i < num_var_ranges; i++) {
1246 type = range_state[i].type;
1247 size = range_state[i].size_pfn;
1248 if (type >= MTRR_NUM_TYPES)
1249 continue;
1250 if (!size)
1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1254 num[type]++;
1255 }
1256
1257 /* check if we got UC entries */
1258 if (!num[MTRR_TYPE_UNCACHABLE])
1259 return 0;
1260
1261 /* check if we only had WB and UC */
1262 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1263 num_var_ranges - num[MTRR_NUM_TYPES])
1264 return 0;
1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1290 memset(range, 0, sizeof(range));
1291 extra_remove_size = 0;
1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1293 if (mtrr_tom2)
1294 extra_remove_size =
1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1297 extra_remove_size);
1298 /*
1299	 * [0, 1M) should always be covered by var mtrr with WB
1300	 * and fixed mtrrs should take effect before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1307 range_sums = sum_ranges(range, nr_range);
1308	printk(KERN_INFO "total RAM covered: %ldM\n",
1309 range_sums >> (20 - PAGE_SHIFT));
1310
1311 if (mtrr_chunk_size && mtrr_gran_size) {
1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1315
1316 debug_print++;
1317 /* convert ranges to var ranges state */
1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1319 mtrr_gran_size);
1320
1321 /* we got new setting in range_state, check it */
1322 memset(range_new, 0, sizeof(range_new));
1323 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1324 extra_remove_base,
1325 extra_remove_size);
1326 range_sums_new = sum_ranges(range_new, nr_range_new);
1327
1328 i = 0;
1329 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1330 result[i].gran_sizek = mtrr_gran_size >> 10;
1331 result[i].num_reg = num_reg;
1332 if (range_sums < range_sums_new) {
1333 result[i].lose_cover_sizek =
1334 (range_sums_new - range_sums) << PSHIFT;
1335 result[i].bad = 1;
1336 } else
1337 result[i].lose_cover_sizek =
1338 (range_sums - range_sums_new) << PSHIFT;
1339
1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1347 result[i].num_reg, result[i].bad?"-":"",
1348 lose_base, lose_factor);
1349 if (!result[i].bad) {
1350 set_var_mtrr_all(address_bits);
1351 return 1;
1352 }
1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1354	       "will find an optimal one\n");
1355 debug_print--;
1356 memset(result, 0, sizeof(result[0]));
1357 }
1358
1359 i = 0;
1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1361 memset(result, 0, sizeof(result));
1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1370 chunk_size <<= 1) {
1371 int num_reg;
1372
1373 if (debug_print) {
1374 char chunk_factor;
1375 unsigned long chunk_base;
1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1382 if (i >= NUM_RESULT)
1383 continue;
1384
1385 /* convert ranges to var ranges state */
1386 num_reg = x86_setup_var_mtrrs(range, nr_range,
1387 chunk_size, gran_size);
1388
1389 /* we got new setting in range_state, check it */
1390 memset(range_new, 0, sizeof(range_new));
1391 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1392 extra_remove_base, extra_remove_size);
1393 range_sums_new = sum_ranges(range_new, nr_range_new);
1394
1395 result[i].chunk_sizek = chunk_size >> 10;
1396 result[i].gran_sizek = gran_size >> 10;
1397 result[i].num_reg = num_reg;
1398 if (range_sums < range_sums_new) {
1399 result[i].lose_cover_sizek =
1400 (range_sums_new - range_sums) << PSHIFT;
1401 result[i].bad = 1;
1402 } else
1403 result[i].lose_cover_sizek =
1404 (range_sums - range_sums_new) << PSHIFT;
1405
1406 /* double check it */
1407 if (!result[i].bad && !result[i].lose_cover_sizek) {
1408 if (nr_range_new != nr_range ||
1409 memcmp(range, range_new, sizeof(range)))
1410 result[i].bad = 1;
1411 }
1412
1413 if (!result[i].bad && (range_sums - range_sums_new <
1414 min_loss_pfn[num_reg])) {
1415 min_loss_pfn[num_reg] =
1416 range_sums - range_sums_new;
1417 }
1418 i++;
1419 }
1420 }
1421
1422 /* print out all */
1423 for (i = 0; i < NUM_RESULT; i++) {
1424 char gran_factor, chunk_factor, lose_factor;
1425 unsigned long gran_base, chunk_base, lose_base;
1426
1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1436 }
1437
1438 /* try to find the optimal index */
1439 if (nr_mtrr_spare_reg >= num_var_ranges)
1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1441 num_reg_good = -1;
1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1443 if (!min_loss_pfn[i])
1444 num_reg_good = i;
1445 }
1446
1447 index_good = -1;
1448 if (num_reg_good != -1) {
1449 for (i = 0; i < NUM_RESULT; i++) {
1450 if (!result[i].bad &&
1451 result[i].num_reg == num_reg_good &&
1452 !result[i].lose_cover_sizek) {
1453 index_good = i;
1454 break;
1455 }
1456 }
1457 }
1458
1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1464 i = index_good;
1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1469 gran_base, gran_factor, chunk_base, chunk_factor);
1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1472 /* convert ranges to var ranges state */
1473 chunk_size = result[i].chunk_sizek;
1474 chunk_size <<= 10;
1475 gran_size = result[i].gran_sizek;
1476 gran_size <<= 10;
1477 debug_print++;
1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1480 set_var_mtrr_all(address_bits);
1481 return 1;
1482 }
1483
1484	printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n");
1485 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1486
1487 return 0;
1488}
1489#else
1490static int __init mtrr_cleanup(unsigned address_bits)
1491{
1492 return 0;
1493}
1494#endif
1495
1496static int __initdata changed_by_mtrr_cleanup;
1497
612static int disable_mtrr_trim; 1498static int disable_mtrr_trim;
613 1499
614static int __init disable_mtrr_trim_setup(char *str) 1500static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1534,19 @@ int __init amd_special_default_mtrr(void)
648 return 0; 1534 return 0;
649} 1535}
650 1536
1537static u64 __init real_trim_memory(unsigned long start_pfn,
1538 unsigned long limit_pfn)
1539{
1540 u64 trim_start, trim_size;
1541 trim_start = start_pfn;
1542 trim_start <<= PAGE_SHIFT;
1543 trim_size = limit_pfn;
1544 trim_size <<= PAGE_SHIFT;
1545 trim_size -= trim_start;
1546
1547 return e820_update_range(trim_start, trim_size, E820_RAM,
1548 E820_RESERVED);
1549}
651/** 1550/**
652 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs 1551 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
653 * @end_pfn: ending page frame number 1552 * @end_pfn: ending page frame number
@@ -663,8 +1562,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
663{ 1562{
664 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1563 unsigned long i, base, size, highest_pfn = 0, def, dummy;
665 mtrr_type type; 1564 mtrr_type type;
666 u64 trim_start, trim_size; 1565 int nr_range;
1566 u64 total_trim_size;
667 1567
 1568	/* extra slot counts the empty (size 0) entries */
1569 int num[MTRR_NUM_TYPES + 1];
668 /* 1570 /*
669 * Make sure we only trim uncachable memory on machines that 1571 * Make sure we only trim uncachable memory on machines that
670 * support the Intel MTRR architecture: 1572 * support the Intel MTRR architecture:
@@ -676,44 +1578,92 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
676 if (def != MTRR_TYPE_UNCACHABLE) 1578 if (def != MTRR_TYPE_UNCACHABLE)
677 return 0; 1579 return 0;
678 1580
679 if (amd_special_default_mtrr()) 1581 /* get it and store it aside */
680 return 0; 1582 memset(range_state, 0, sizeof(range_state));
1583 for (i = 0; i < num_var_ranges; i++) {
1584 mtrr_if->get(i, &base, &size, &type);
1585 range_state[i].base_pfn = base;
1586 range_state[i].size_pfn = size;
1587 range_state[i].type = type;
1588 }
681 1589
682 /* Find highest cached pfn */ 1590 /* Find highest cached pfn */
683 for (i = 0; i < num_var_ranges; i++) { 1591 for (i = 0; i < num_var_ranges; i++) {
684 mtrr_if->get(i, &base, &size, &type); 1592 type = range_state[i].type;
685 if (type != MTRR_TYPE_WRBACK) 1593 if (type != MTRR_TYPE_WRBACK)
686 continue; 1594 continue;
1595 base = range_state[i].base_pfn;
1596 size = range_state[i].size_pfn;
687 if (highest_pfn < base + size) 1597 if (highest_pfn < base + size)
688 highest_pfn = base + size; 1598 highest_pfn = base + size;
689 } 1599 }
690 1600
691 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
692 if (!highest_pfn) { 1602 if (!highest_pfn) {
693 if (!kvm_para_available()) { 1603 WARN(!kvm_para_available(), KERN_WARNING
694 printk(KERN_WARNING
695 "WARNING: strange, CPU MTRRs all blank?\n"); 1604 "WARNING: strange, CPU MTRRs all blank?\n");
696 WARN_ON(1);
697 }
698 return 0; 1605 return 0;
699 } 1606 }
700 1607
701 if (highest_pfn < end_pfn) { 1608 /* check entries number */
1609 memset(num, 0, sizeof(num));
1610 for (i = 0; i < num_var_ranges; i++) {
1611 type = range_state[i].type;
1612 if (type >= MTRR_NUM_TYPES)
1613 continue;
1614 size = range_state[i].size_pfn;
1615 if (!size)
1616 type = MTRR_NUM_TYPES;
1617 num[type]++;
1618 }
1619
1620 /* no entry for WB? */
1621 if (!num[MTRR_TYPE_WRBACK])
1622 return 0;
1623
1624 /* check if we only had WB and UC */
1625 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1626 num_var_ranges - num[MTRR_NUM_TYPES])
1627 return 0;
1628
1629 memset(range, 0, sizeof(range));
1630 nr_range = 0;
1631 if (mtrr_tom2) {
1632 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1633 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1634 if (highest_pfn < range[nr_range].end + 1)
1635 highest_pfn = range[nr_range].end + 1;
1636 nr_range++;
1637 }
1638 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1639
1640 total_trim_size = 0;
1641 /* check the head */
1642 if (range[0].start)
1643 total_trim_size += real_trim_memory(0, range[0].start);
1644 /* check the holes */
1645 for (i = 0; i < nr_range - 1; i++) {
1646 if (range[i].end + 1 < range[i+1].start)
1647 total_trim_size += real_trim_memory(range[i].end + 1,
1648 range[i+1].start);
1649 }
1650 /* check the top */
1651 i = nr_range - 1;
1652 if (range[i].end + 1 < end_pfn)
1653 total_trim_size += real_trim_memory(range[i].end + 1,
1654 end_pfn);
1655
1656 if (total_trim_size) {
702 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" 1657 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
703 " all of memory, losing %luMB of RAM.\n", 1658 " all of memory, losing %lluMB of RAM.\n",
704 (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); 1659 total_trim_size >> 20);
705 1660
706 WARN_ON(1); 1661 if (!changed_by_mtrr_cleanup)
1662 WARN_ON(1);
707 1663
708 printk(KERN_INFO "update e820 for mtrr\n"); 1664 printk(KERN_INFO "update e820 for mtrr\n");
709 trim_start = highest_pfn;
710 trim_start <<= PAGE_SHIFT;
711 trim_size = end_pfn;
712 trim_size <<= PAGE_SHIFT;
713 trim_size -= trim_start;
714 update_memory_range(trim_start, trim_size, E820_RAM,
715 E820_RESERVED);
716 update_e820(); 1665 update_e820();
1666
717 return 1; 1667 return 1;
718 } 1668 }
719 1669
@@ -729,18 +1679,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
729 */ 1679 */
730void __init mtrr_bp_init(void) 1680void __init mtrr_bp_init(void)
731{ 1681{
1682 u32 phys_addr;
732 init_ifs(); 1683 init_ifs();
733 1684
1685 phys_addr = 32;
1686
734 if (cpu_has_mtrr) { 1687 if (cpu_has_mtrr) {
735 mtrr_if = &generic_mtrr_ops; 1688 mtrr_if = &generic_mtrr_ops;
736 size_or_mask = 0xff000000; /* 36 bits */ 1689 size_or_mask = 0xff000000; /* 36 bits */
737 size_and_mask = 0x00f00000; 1690 size_and_mask = 0x00f00000;
1691 phys_addr = 36;
738 1692
739 /* This is an AMD specific MSR, but we assume(hope?) that 1693 /* This is an AMD specific MSR, but we assume(hope?) that
740 Intel will implement it to when they extend the address 1694 Intel will implement it to when they extend the address
741 bus of the Xeon. */ 1695 bus of the Xeon. */
742 if (cpuid_eax(0x80000000) >= 0x80000008) { 1696 if (cpuid_eax(0x80000000) >= 0x80000008) {
743 u32 phys_addr;
744 phys_addr = cpuid_eax(0x80000008) & 0xff; 1697 phys_addr = cpuid_eax(0x80000008) & 0xff;
745 /* CPUID workaround for Intel 0F33/0F34 CPU */ 1698 /* CPUID workaround for Intel 0F33/0F34 CPU */
746 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 1699 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1711,7 @@ void __init mtrr_bp_init(void)
758 don't support PAE */ 1711 don't support PAE */
759 size_or_mask = 0xfff00000; /* 32 bits */ 1712 size_or_mask = 0xfff00000; /* 32 bits */
760 size_and_mask = 0; 1713 size_and_mask = 0;
1714 phys_addr = 32;
761 } 1715 }
762 } else { 1716 } else {
763 switch (boot_cpu_data.x86_vendor) { 1717 switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1745,15 @@ void __init mtrr_bp_init(void)
791 if (mtrr_if) { 1745 if (mtrr_if) {
792 set_num_var_ranges(); 1746 set_num_var_ranges();
793 init_table(); 1747 init_table();
794 if (use_intel()) 1748 if (use_intel()) {
795 get_mtrr_state(); 1749 get_mtrr_state();
1750
1751 if (mtrr_cleanup(phys_addr)) {
1752 changed_by_mtrr_cleanup = 1;
1753 mtrr_if->set_all();
1754 }
1755
1756 }
796 } 1757 }
797} 1758}
798 1759
@@ -822,16 +1783,17 @@ void mtrr_ap_init(void)
822 */ 1783 */
823void mtrr_save_state(void) 1784void mtrr_save_state(void)
824{ 1785{
825 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); 1786 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
826} 1787}
827 1788
828static int __init mtrr_init_finialize(void) 1789static int __init mtrr_init_finialize(void)
829{ 1790{
830 if (!mtrr_if) 1791 if (!mtrr_if)
831 return 0; 1792 return 0;
832 if (use_intel()) 1793 if (use_intel()) {
833 mtrr_state_warn(); 1794 if (!changed_by_mtrr_cleanup)
834 else { 1795 mtrr_state_warn();
1796 } else {
835 /* The CPUs haven't MTRR and seem to not support SMP. They have 1797 /* The CPUs haven't MTRR and seem to not support SMP. They have
836 * specific drivers, we use a tricky method to support 1798 * specific drivers, we use a tricky method to support
837 * suspend/resume for them. 1799 * suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
83 83
84void fill_mtrr_var_range(unsigned int index,
85 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
84void get_mtrr_state(void); 86void get_mtrr_state(void);
85 87
86extern void set_mtrr_ops(struct mtrr_ops * ops); 88extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
92#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 94#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
93 95
94extern unsigned int num_var_ranges; 96extern unsigned int num_var_ranges;
97extern u64 mtrr_tom2;
95 98
96void mtrr_state_warn(void); 99void mtrr_state_warn(void);
97const char *mtrr_attrib_to_str(int x); 100const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f9ae93adffe5..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,11 +1,15 @@
1/* local apic based NMI watchdog for various CPUs. 1/*
2 This file also handles reservation of performance counters for coordination 2 * local apic based NMI watchdog for various CPUs.
3 with other users (like oprofile). 3 *
4 4 * This file also handles reservation of performance counters for coordination
5 Note that these events normally don't tick when the CPU idles. This means 5 * with other users (like oprofile).
6 the frequency varies with CPU load. 6 *
7 7 * Note that these events normally don't tick when the CPU idles. This means
8 Original code for K7/P6 written by Keith Owens */ 8 * the frequency varies with CPU load.
9 *
10 * Original code for K7/P6 written by Keith Owens
11 *
12 */
9 13
10#include <linux/percpu.h> 14#include <linux/percpu.h>
11#include <linux/module.h> 15#include <linux/module.h>
@@ -36,12 +40,16 @@ struct wd_ops {
36 40
37static const struct wd_ops *wd_ops; 41static const struct wd_ops *wd_ops;
38 42
39/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 43/*
40 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 44 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
45 * offset from MSR_P4_BSU_ESCR0.
46 *
47 * It will be the max for all platforms (for now)
41 */ 48 */
42#define NMI_MAX_COUNTER_BITS 66 49#define NMI_MAX_COUNTER_BITS 66
43 50
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers: 51/*
52 * perfctr_nmi_owner tracks the ownership of the perfctr registers:
45 * evtsel_nmi_owner tracks the ownership of the event selection 53 * evtsel_nmi_owner tracks the ownership of the event selection
46 * - different performance counters/ event selection may be reserved for 54 * - different performance counters/ event selection may be reserved for
47 * different subsystems this reservation system just tries to coordinate 55 * different subsystems this reservation system just tries to coordinate
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
73 return 0; 81 return 0;
74} 82}
75 83
76/* converts an msr to an appropriate reservation bit */ 84/*
77/* returns the bit offset of the event selection register */ 85 * converts an msr to an appropriate reservation bit
86 * returns the bit offset of the event selection register
87 */
78static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) 88static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
79{ 89{
80 /* returns the bit offset of the event selection register */ 90 /* returns the bit offset of the event selection register */
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
114 124
115 return (!test_bit(counter, perfctr_nmi_owner)); 125 return (!test_bit(counter, perfctr_nmi_owner));
116} 126}
127EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
117 128
118int reserve_perfctr_nmi(unsigned int msr) 129int reserve_perfctr_nmi(unsigned int msr)
119{ 130{
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr)
128 return 1; 139 return 1;
129 return 0; 140 return 0;
130} 141}
142EXPORT_SYMBOL(reserve_perfctr_nmi);
131 143
132void release_perfctr_nmi(unsigned int msr) 144void release_perfctr_nmi(unsigned int msr)
133{ 145{
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr)
140 152
141 clear_bit(counter, perfctr_nmi_owner); 153 clear_bit(counter, perfctr_nmi_owner);
142} 154}
155EXPORT_SYMBOL(release_perfctr_nmi);
143 156
144int reserve_evntsel_nmi(unsigned int msr) 157int reserve_evntsel_nmi(unsigned int msr)
145{ 158{
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr)
154 return 1; 167 return 1;
155 return 0; 168 return 0;
156} 169}
170EXPORT_SYMBOL(reserve_evntsel_nmi);
157 171
158void release_evntsel_nmi(unsigned int msr) 172void release_evntsel_nmi(unsigned int msr)
159{ 173{
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr)
166 180
167 clear_bit(counter, evntsel_nmi_owner); 181 clear_bit(counter, evntsel_nmi_owner);
168} 182}
169
170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
171EXPORT_SYMBOL(reserve_perfctr_nmi);
172EXPORT_SYMBOL(release_perfctr_nmi);
173EXPORT_SYMBOL(reserve_evntsel_nmi);
174EXPORT_SYMBOL(release_evntsel_nmi); 183EXPORT_SYMBOL(release_evntsel_nmi);
175 184
176void disable_lapic_nmi_watchdog(void) 185void disable_lapic_nmi_watchdog(void)
@@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void)
180 if (atomic_read(&nmi_active) <= 0) 189 if (atomic_read(&nmi_active) <= 0)
181 return; 190 return;
182 191
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); 192 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184 wd_ops->unreserve(); 193
194 if (wd_ops)
195 wd_ops->unreserve();
185 196
186 BUG_ON(atomic_read(&nmi_active) != 0); 197 BUG_ON(atomic_read(&nmi_active) != 0);
187} 198}
@@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void)
202 return; 213 return;
203 } 214 }
204 215
205 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); 216 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
206 touch_nmi_watchdog(); 217 touch_nmi_watchdog();
207} 218}
208 219
@@ -232,31 +243,32 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
232 return retval; 243 return retval;
233} 244}
234 245
235static void 246static void write_watchdog_counter(unsigned int perfctr_msr,
236write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) 247 const char *descr, unsigned nmi_hz)
237{ 248{
238 u64 count = (u64)cpu_khz * 1000; 249 u64 count = (u64)cpu_khz * 1000;
239 250
240 do_div(count, nmi_hz); 251 do_div(count, nmi_hz);
241 if(descr) 252 if(descr)
242 Dprintk("setting %s to -0x%08Lx\n", descr, count); 253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
243 wrmsrl(perfctr_msr, 0 - count); 254 wrmsrl(perfctr_msr, 0 - count);
244} 255}
245 256
246static void write_watchdog_counter32(unsigned int perfctr_msr, 257static void write_watchdog_counter32(unsigned int perfctr_msr,
247 const char *descr, unsigned nmi_hz) 258 const char *descr, unsigned nmi_hz)
248{ 259{
249 u64 count = (u64)cpu_khz * 1000; 260 u64 count = (u64)cpu_khz * 1000;
250 261
251 do_div(count, nmi_hz); 262 do_div(count, nmi_hz);
252 if(descr) 263 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsr(perfctr_msr, (u32)(-count), 0); 265 wrmsr(perfctr_msr, (u32)(-count), 0);
255} 266}
256 267
257/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface 268/*
258 nicely stable so there is not much variety */ 269 * AMD K7/K8/Family10h/Family11h support.
259 270 * AMD keeps this interface nicely stable so there is not much variety
271 */
260#define K7_EVNTSEL_ENABLE (1 << 22) 272#define K7_EVNTSEL_ENABLE (1 << 22)
261#define K7_EVNTSEL_INT (1 << 20) 273#define K7_EVNTSEL_INT (1 << 20)
262#define K7_EVNTSEL_OS (1 << 17) 274#define K7_EVNTSEL_OS (1 << 17)
@@ -283,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
283 /* setup the timer */ 295 /* setup the timer */
284 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
285 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298
299 /* initialize the wd struct before enabling */
300 wd->perfctr_msr = perfctr_msr;
301 wd->evntsel_msr = evntsel_msr;
302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
286 apic_write(APIC_LVTPC, APIC_DM_NMI); 307 apic_write(APIC_LVTPC, APIC_DM_NMI);
287 evntsel |= K7_EVNTSEL_ENABLE; 308 evntsel |= K7_EVNTSEL_ENABLE;
288 wrmsr(evntsel_msr, evntsel, 0); 309 wrmsr(evntsel_msr, evntsel, 0);
289 310
290 wd->perfctr_msr = perfctr_msr;
291 wd->evntsel_msr = evntsel_msr;
292 wd->cccr_msr = 0; //unused
293 return 1; 311 return 1;
294} 312}
295 313
@@ -325,18 +343,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
325} 343}
326 344
327static const struct wd_ops k7_wd_ops = { 345static const struct wd_ops k7_wd_ops = {
328 .reserve = single_msr_reserve, 346 .reserve = single_msr_reserve,
329 .unreserve = single_msr_unreserve, 347 .unreserve = single_msr_unreserve,
330 .setup = setup_k7_watchdog, 348 .setup = setup_k7_watchdog,
331 .rearm = single_msr_rearm, 349 .rearm = single_msr_rearm,
332 .stop = single_msr_stop_watchdog, 350 .stop = single_msr_stop_watchdog,
333 .perfctr = MSR_K7_PERFCTR0, 351 .perfctr = MSR_K7_PERFCTR0,
334 .evntsel = MSR_K7_EVNTSEL0, 352 .evntsel = MSR_K7_EVNTSEL0,
335 .checkbit = 1ULL<<47, 353 .checkbit = 1ULL << 47,
336}; 354};
337 355
338/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ 356/*
339 357 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
358 */
340#define P6_EVNTSEL0_ENABLE (1 << 22) 359#define P6_EVNTSEL0_ENABLE (1 << 22)
341#define P6_EVNTSEL_INT (1 << 20) 360#define P6_EVNTSEL_INT (1 << 20)
342#define P6_EVNTSEL_OS (1 << 17) 361#define P6_EVNTSEL_OS (1 << 17)
@@ -366,58 +385,91 @@ static int setup_p6_watchdog(unsigned nmi_hz)
366 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
367 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
368 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
388
389 /* initialize the wd struct before enabling */
390 wd->perfctr_msr = perfctr_msr;
391 wd->evntsel_msr = evntsel_msr;
392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
369 apic_write(APIC_LVTPC, APIC_DM_NMI); 397 apic_write(APIC_LVTPC, APIC_DM_NMI);
370 evntsel |= P6_EVNTSEL0_ENABLE; 398 evntsel |= P6_EVNTSEL0_ENABLE;
371 wrmsr(evntsel_msr, evntsel, 0); 399 wrmsr(evntsel_msr, evntsel, 0);
372 400
373 wd->perfctr_msr = perfctr_msr;
374 wd->evntsel_msr = evntsel_msr;
375 wd->cccr_msr = 0; //unused
376 return 1; 401 return 1;
377} 402}
378 403
379static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 404static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
380{ 405{
381 /* P6 based Pentium M need to re-unmask 406 /*
407 * P6 based Pentium M need to re-unmask
382 * the apic vector but it doesn't hurt 408 * the apic vector but it doesn't hurt
383 * other P6 variant. 409 * other P6 variant.
384 * ArchPerfom/Core Duo also needs this */ 410 * ArchPerfom/Core Duo also needs this
411 */
385 apic_write(APIC_LVTPC, APIC_DM_NMI); 412 apic_write(APIC_LVTPC, APIC_DM_NMI);
413
386 /* P6/ARCH_PERFMON has 32 bit counter write */ 414 /* P6/ARCH_PERFMON has 32 bit counter write */
387 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); 415 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
388} 416}
389 417
390static const struct wd_ops p6_wd_ops = { 418static const struct wd_ops p6_wd_ops = {
391 .reserve = single_msr_reserve, 419 .reserve = single_msr_reserve,
392 .unreserve = single_msr_unreserve, 420 .unreserve = single_msr_unreserve,
393 .setup = setup_p6_watchdog, 421 .setup = setup_p6_watchdog,
394 .rearm = p6_rearm, 422 .rearm = p6_rearm,
395 .stop = single_msr_stop_watchdog, 423 .stop = single_msr_stop_watchdog,
396 .perfctr = MSR_P6_PERFCTR0, 424 .perfctr = MSR_P6_PERFCTR0,
397 .evntsel = MSR_P6_EVNTSEL0, 425 .evntsel = MSR_P6_EVNTSEL0,
398 .checkbit = 1ULL<<39, 426 .checkbit = 1ULL << 39,
399}; 427};
400 428
401/* Intel P4 performance counters. By far the most complicated of all. */ 429/*
402 430 * Intel P4 performance counters.
403#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 431 * By far the most complicated of all.
404#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 432 */
405#define P4_ESCR_OS (1<<3) 433#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
406#define P4_ESCR_USR (1<<2) 434#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
407#define P4_CCCR_OVF_PMI0 (1<<26) 435#define P4_ESCR_OS (1 << 3)
408#define P4_CCCR_OVF_PMI1 (1<<27) 436#define P4_ESCR_USR (1 << 2)
409#define P4_CCCR_THRESHOLD(N) ((N)<<20) 437#define P4_CCCR_OVF_PMI0 (1 << 26)
410#define P4_CCCR_COMPLEMENT (1<<19) 438#define P4_CCCR_OVF_PMI1 (1 << 27)
411#define P4_CCCR_COMPARE (1<<18) 439#define P4_CCCR_THRESHOLD(N) ((N) << 20)
412#define P4_CCCR_REQUIRED (3<<16) 440#define P4_CCCR_COMPLEMENT (1 << 19)
413#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 441#define P4_CCCR_COMPARE (1 << 18)
414#define P4_CCCR_ENABLE (1<<12) 442#define P4_CCCR_REQUIRED (3 << 16)
415#define P4_CCCR_OVF (1<<31) 443#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
416 444#define P4_CCCR_ENABLE (1 << 12)
417/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 445#define P4_CCCR_OVF (1 << 31)
418 CRU_ESCR0 (with any non-null event selector) through a complemented 446
419 max threshold. [IA32-Vol3, Section 14.9.9] */ 447#define P4_CONTROLS 18
420 448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
468/*
469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
470 * CRU_ESCR0 (with any non-null event selector) through a complemented
471 * max threshold. [IA32-Vol3, Section 14.9.9]
472 */
421static int setup_p4_watchdog(unsigned nmi_hz) 473static int setup_p4_watchdog(unsigned nmi_hz)
422{ 474{
423 unsigned int perfctr_msr, evntsel_msr, cccr_msr; 475 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
@@ -442,7 +494,8 @@ static int setup_p4_watchdog(unsigned nmi_hz)
442#endif 494#endif
443 ht_num = 0; 495 ht_num = 0;
444 496
445 /* performance counters are shared resources 497 /*
498 * performance counters are shared resources
446 * assign each hyperthread its own set 499 * assign each hyperthread its own set
447 * (re-use the ESCR0 register, seems safe 500 * (re-use the ESCR0 register, seems safe
448 * and keeps the cccr_val the same) 501 * and keeps the cccr_val the same)
@@ -453,12 +506,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
453 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
454 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
455 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
456 } else { 529 } else {
457 /* logical cpu 1 */ 530 /* logical cpu 1 */
458 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
459 evntsel_msr = MSR_P4_CRU_ESCR0; 532 evntsel_msr = MSR_P4_CRU_ESCR0;
460 cccr_msr = MSR_P4_IQ_CCCR1; 533 cccr_msr = MSR_P4_IQ_CCCR1;
461 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 534
535 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
536 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
537 cccr_val = P4_CCCR_OVF_PMI0;
538 else
539 cccr_val = P4_CCCR_OVF_PMI1;
540 cccr_val |= P4_CCCR_ESCR_SELECT(4);
462 } 541 }
463 542
464 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 543 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -473,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
474 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
475 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
476 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
477 cccr_val |= P4_CCCR_ENABLE;
478 wrmsr(cccr_msr, cccr_val, 0);
479 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
480 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
481 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
482 return 1; 566 return 1;
483} 567}
484 568
@@ -540,20 +624,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
540} 624}
541 625
542static const struct wd_ops p4_wd_ops = { 626static const struct wd_ops p4_wd_ops = {
543 .reserve = p4_reserve, 627 .reserve = p4_reserve,
544 .unreserve = p4_unreserve, 628 .unreserve = p4_unreserve,
545 .setup = setup_p4_watchdog, 629 .setup = setup_p4_watchdog,
546 .rearm = p4_rearm, 630 .rearm = p4_rearm,
547 .stop = stop_p4_watchdog, 631 .stop = stop_p4_watchdog,
548 /* RED-PEN this is wrong for the other sibling */ 632 /* RED-PEN this is wrong for the other sibling */
549 .perfctr = MSR_P4_BPU_PERFCTR0, 633 .perfctr = MSR_P4_BPU_PERFCTR0,
550 .evntsel = MSR_P4_BSU_ESCR0, 634 .evntsel = MSR_P4_BSU_ESCR0,
551 .checkbit = 1ULL<<39, 635 .checkbit = 1ULL << 39,
552}; 636};
553 637
554/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully 638/*
555 all future Intel CPUs. */ 639 * Watchdog using the Intel architected PerfMon.
556 640 * Used for Core2 and hopefully all future Intel CPUs.
641 */
557#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 642#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
558#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK 643#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
559 644
@@ -593,25 +678,29 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
593 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
594 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
595 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
596 apic_write(APIC_LVTPC, APIC_DM_NMI);
597 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
598 wrmsr(evntsel_msr, evntsel, 0);
599 681
600 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
601 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
602 wd->cccr_msr = 0; //unused 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
603 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
604 return 1; 693 return 1;
605} 694}
606 695
607static struct wd_ops intel_arch_wd_ops __read_mostly = { 696static struct wd_ops intel_arch_wd_ops __read_mostly = {
608 .reserve = single_msr_reserve, 697 .reserve = single_msr_reserve,
609 .unreserve = single_msr_unreserve, 698 .unreserve = single_msr_unreserve,
610 .setup = setup_intel_arch_watchdog, 699 .setup = setup_intel_arch_watchdog,
611 .rearm = p6_rearm, 700 .rearm = p6_rearm,
612 .stop = single_msr_stop_watchdog, 701 .stop = single_msr_stop_watchdog,
613 .perfctr = MSR_ARCH_PERFMON_PERFCTR1, 702 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
614 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, 703 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
615}; 704};
616 705
617static void probe_nmi_watchdog(void) 706static void probe_nmi_watchdog(void)
@@ -624,8 +713,10 @@ static void probe_nmi_watchdog(void)
624 wd_ops = &k7_wd_ops; 713 wd_ops = &k7_wd_ops;
625 break; 714 break;
626 case X86_VENDOR_INTEL: 715 case X86_VENDOR_INTEL:
627 /* Work around Core Duo (Yonah) errata AE49 where perfctr1 716 /*
628 doesn't have a working enable bit. */ 717 * Work around Core Duo (Yonah) errata AE49 where perfctr1
718 * doesn't have a working enable bit.
719 */
629 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { 720 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
630 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; 721 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
631 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; 722 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
@@ -636,7 +727,7 @@ static void probe_nmi_watchdog(void)
636 } 727 }
637 switch (boot_cpu_data.x86) { 728 switch (boot_cpu_data.x86) {
638 case 6: 729 case 6:
639 if (boot_cpu_data.x86_model > 0xd) 730 if (boot_cpu_data.x86_model > 13)
640 return; 731 return;
641 732
642 wd_ops = &p6_wd_ops; 733 wd_ops = &p6_wd_ops;
@@ -697,10 +788,11 @@ int lapic_wd_event(unsigned nmi_hz)
697{ 788{
698 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 789 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
699 u64 ctr; 790 u64 ctr;
791
700 rdmsrl(wd->perfctr_msr, ctr); 792 rdmsrl(wd->perfctr_msr, ctr);
701 if (ctr & wd_ops->checkbit) { /* perfctr still running? */ 793 if (ctr & wd_ops->checkbit) /* perfctr still running? */
702 return 0; 794 return 0;
703 } 795
704 wd_ops->rearm(wd, nmi_hz); 796 wd_ops->rearm(wd, nmi_hz);
705 return 1; 797 return 1;
706} 798}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
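
These strings feed the "power management" line in /proc/cpuinfo: each set bit of the advanced power-management dword (CPUID 0x80000007 EDX) selects the entry with the same index, and unnamed bits are printed by number. A hedged userspace sketch of that decoding, using a made-up flags value (the exact /proc formatting is an assumption):

#include <stdio.h>

static const char *const x86_power_flags_demo[32] = {
	"ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate", "",
};

int main(void)
{
	unsigned int power = 0x0000001f;	/* hypothetical CPUID 0x80000007 EDX */
	int i;

	for (i = 0; i < 32; i++) {
		if (!(power & (1u << i)))
			continue;
		if (x86_power_flags_demo[i] && x86_power_flags_demo[i][0])
			printf(" %s", x86_power_flags_demo[i]);
		else
			printf(" [%d]", i);	/* unnamed bit, printed by index */
	}
	printf("\n");	/* -> " ts fid vid ttp tm" for 0x1f */
	return 0;
}
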
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index daff52a62248..6a44d6465991 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,9 +33,9 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
36#include <linux/major.h> 37#include <linux/major.h>
37#include <linux/fs.h> 38#include <linux/fs.h>
38#include <linux/smp_lock.h>
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -88,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
88 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
89 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
90 u64 pos = *ppos; 90 u64 pos = *ppos;
91 ssize_t bytes = 0;
92 int err = 0;
91 93
92 if (count % 16) 94 if (count % 16)
93 return -EINVAL; /* Invalid chunk size */ 95 return -EINVAL; /* Invalid chunk size */
@@ -95,27 +97,40 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
95 for (; count; count -= 16) { 97 for (; count; count -= 16) {
96 cmd.eax = pos; 98 cmd.eax = pos;
97 cmd.ecx = pos >> 32; 99 cmd.ecx = pos >> 32;
98 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); 100 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
99 if (copy_to_user(tmp, &cmd, 16)) 101 if (err)
100 return -EFAULT; 102 break;
103 if (copy_to_user(tmp, &cmd, 16)) {
104 err = -EFAULT;
105 break;
106 }
101 tmp += 16; 107 tmp += 16;
108 bytes += 16;
102 *ppos = ++pos; 109 *ppos = ++pos;
103 } 110 }
104 111
105 return tmp - buf; 112 return bytes ? bytes : err;
106} 113}
107 114
108static int cpuid_open(struct inode *inode, struct file *file) 115static int cpuid_open(struct inode *inode, struct file *file)
109{ 116{
110 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 117 unsigned int cpu;
111 struct cpuinfo_x86 *c = &cpu_data(cpu); 118 struct cpuinfo_x86 *c;
112 119 int ret = 0;
113 if (cpu >= NR_CPUS || !cpu_online(cpu)) 120
114 return -ENXIO; /* No such CPU */ 121 lock_kernel();
122
123 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
125 ret = -ENXIO; /* No such CPU */
126 goto out;
127 }
128 c = &cpu_data(cpu);
115 if (c->cpuid_level < 0) 129 if (c->cpuid_level < 0)
116 return -EIO; /* CPUID not supported */ 130 ret = -EIO; /* CPUID not supported */
117 131out:
118 return 0; 132 unlock_kernel();
133 return ret;
119} 134}
120 135
121/* 136/*
@@ -132,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu)
132{ 147{
133 struct device *dev; 148 struct device *dev;
134 149
135 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 150 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
136 "cpu%d", cpu); 151 NULL, "cpu%d", cpu);
137 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 152 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
138} 153}
139 154
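
The reworked cpuid_read() above stops returning "tmp - buf" and instead keeps an explicit bytes/err pair, so a failing smp_call_function_single() or copy_to_user() no longer throws away data that was already copied out. A minimal userspace sketch of that partial-read convention, assuming a hypothetical read_chunk() helper in place of the per-chunk CPUID query and copy-out:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define CHUNK 16

/* hypothetical stand-in for smp_call_function_single() + copy_to_user():
 * succeeds until the offset reaches 'limit', then fails with -EIO */
static int read_chunk(char *dst, size_t off, size_t limit)
{
	if (off >= limit)
		return -5;			/* -EIO */
	memset(dst, 0xab, CHUNK);
	return 0;
}

static ssize_t read_in_chunks(char *buf, size_t count, size_t limit)
{
	ssize_t bytes = 0;
	int err = 0;

	if (count % CHUNK)
		return -22;			/* -EINVAL: invalid chunk size */

	for (; count; count -= CHUNK) {
		err = read_chunk(buf + bytes, bytes, limit);
		if (err)
			break;			/* keep what was already copied */
		bytes += CHUNK;
	}

	/* partial success wins; the error surfaces on the next call */
	return bytes ? bytes : err;
}

int main(void)
{
	char buf[64];

	printf("%zd\n", read_in_chunks(buf, sizeof(buf), 32));	/* prints 32 */
	printf("%zd\n", read_in_chunks(buf, sizeof(buf), 0));	/* prints -5 */
	return 0;
}
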
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10 10#include <linux/uaccess.h>
11#include <asm/uaccess.h> 11#include <linux/io.h>
12#include <asm/io.h>
13 12
14/** 13/**
15 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 24 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 25 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 26ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 27 size_t csize, unsigned long offset, int userbuf)
29{ 28{
30 void *vaddr; 29 void *vaddr;
31 30
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 32 return 0;
34 33
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
36 37
37 if (userbuf) { 38 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 40 iounmap(vaddr);
40 return -EFAULT; 41 return -EFAULT;
41 } 42 }
42 } else 43 } else
43 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
44 45
45 iounmap(vaddr); 46 iounmap(vaddr);
46 return csize; 47 return csize;
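
The only functional change in copy_oldmem_page() above is the new NULL check on ioremap(); the mapping is still torn down on every exit path. A userspace sketch of the same map, check, copy, unmap pattern, with mmap() standing in for ioremap() (purely illustrative, not the kernel interface):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* copy csize bytes out of a freshly mapped page, releasing the mapping
 * on every exit path */
static ssize_t copy_from_mapped_page(char *buf, size_t csize, size_t offset)
{
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
	void *vaddr;

	if (csize == 0)
		return 0;

	/* stand-in for ioremap(pfn << PAGE_SHIFT, PAGE_SIZE) */
	vaddr = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (vaddr == MAP_FAILED)
		return -12;			/* -ENOMEM, as in the patched code */

	memcpy(buf, (char *)vaddr + offset, csize);

	munmap(vaddr, pagesz);			/* iounmap() in the original */
	return (ssize_t)csize;
}

int main(void)
{
	char buf[64];

	printf("%zd\n", copy_from_mapped_page(buf, sizeof(buf), 0));	/* 64 */
	return 0;
}
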
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0 if access is granted; -errno otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially give the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and its DS
228 * structure, if necessary; returns NULL if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
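
The ds_get()/ds_set() helpers introduced above replace the old per-field access_desc tables with one computed offset: the field number plus a four-slot block per qualifier (BTS in slots 0-3, PEBS in slots 4-7), scaled by the configured field size. A standalone sketch of that indexing, using the 12-slot pointer-sized layout of ds_cfg_64; the demo values are made up:

#include <stdio.h>

enum ds_field { ds_buffer_base, ds_index, ds_absolute_maximum,
		ds_interrupt_threshold };
enum ds_qualifier { ds_bts, ds_pebs };

static unsigned char sizeof_field = sizeof(unsigned long);

/* offset = (field + 4 * qualifier) * field size */
static unsigned long ds_get(const unsigned char *base,
			    enum ds_qualifier qual, enum ds_field field)
{
	base += sizeof_field * (field + (4 * qual));
	return *(const unsigned long *)base;
}

static void ds_set(unsigned char *base, enum ds_qualifier qual,
		   enum ds_field field, unsigned long value)
{
	base += sizeof_field * (field + (4 * qual));
	*(unsigned long *)base = value;
}

int main(void)
{
	/* 12 pointer-sized slots, as in ds_cfg_64 (.sizeof_ds = 8 * 12) */
	unsigned long ds_words[12] = { 0 };
	unsigned char *ds = (unsigned char *)ds_words;

	ds_set(ds, ds_bts, ds_index, 0x1000);
	ds_set(ds, ds_pebs, ds_index, 0x2000);

	printf("bts index  %#lx\n", ds_get(ds, ds_bts, ds_index));
	printf("pebs index %#lx\n", ds_get(ds, ds_pebs, ds_index));
	return 0;
}
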
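
The allocation-type accounting described in the comment block above ("=0 no tracers, >0 per-thread, <0 per-cpu") can be tried out on its own; this sketch models only the counting rule, not the ds contexts or the locking, and uses a plain pointer where the kernel passes a task_struct:

#include <stdio.h>

/* >0: per-thread tracers, <0: per-cpu tracers, 0: none */
static long tracers;

/* a non-NULL "task" means per-thread, NULL means per-cpu */
static void get_tracer(void *task)  { tracers += task ? 1 : -1; }
static void put_tracer(void *task)  { tracers -= task ? 1 : -1; }

/* a new tracer is allowed only while all existing tracers (if any)
 * are of the same kind */
static int check_tracer(void *task)
{
	return task ? (tracers >= 0) : (tracers <= 0);
}

int main(void)
{
	int task = 0;	/* stands in for a task_struct */

	get_tracer(&task);					/* one per-thread tracer */
	printf("per-cpu allowed?    %d\n", check_tracer(NULL));	/* 0 */
	printf("per-thread allowed? %d\n", check_tracer(&task));	/* 1 */

	put_tracer(&task);
	printf("per-cpu allowed?    %d\n", check_tracer(NULL));	/* 1 */
	return 0;
}
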
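
ds_request() above also rounds the caller's buffer before programming the DS area: the start is pushed up to the next double-word boundary and the size is truncated to a whole number of records, which is why the function insists on size >= alignment + record size. The same arithmetic in isolation, with a made-up buffer address and the 24-byte 64-bit BTS record size:

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* align the start, then truncate the size to whole records */
static void adjust_buffer(unsigned long *buffer, unsigned long *size,
			  unsigned long alignment, unsigned long rec_size)
{
	unsigned long adj = ALIGN_UP(*buffer, alignment) - *buffer;

	*buffer += adj;
	*size -= adj;
	*size -= *size % rec_size;
}

int main(void)
{
	unsigned long buffer = 0x1003, size = 1000;

	adjust_buffer(&buffer, &size, 8, 24);
	printf("buffer %#lx, size %lu\n", buffer, size);	/* 0x1008, 984 */
	return 0;
}
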
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..201ee359a1a9
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,447 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16
17#include <asm/stacktrace.h>
18
19#define STACKSLOTS_PER_LINE 8
20#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
21
22int panic_on_unrecovered_nmi;
23int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
24static unsigned int code_bytes = 64;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33static inline int valid_stack_ptr(struct thread_info *tinfo,
34 void *p, unsigned int size, void *end)
35{
36 void *t = tinfo;
37 if (end) {
38 if (p < end && p >= (end-THREAD_SIZE))
39 return 1;
40 else
41 return 0;
42 }
43 return p > t && p < t + THREAD_SIZE - size;
44}
45
46/* The form of the top of the frame on the stack */
47struct stack_frame {
48 struct stack_frame *next_frame;
49 unsigned long return_address;
50};
51
52static inline unsigned long
53print_context_stack(struct thread_info *tinfo,
54 unsigned long *stack, unsigned long bp,
55 const struct stacktrace_ops *ops, void *data,
56 unsigned long *end)
57{
58 struct stack_frame *frame = (struct stack_frame *)bp;
59
60 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
61 unsigned long addr;
62
63 addr = *stack;
64 if (__kernel_text_address(addr)) {
65 if ((unsigned long) stack == bp + sizeof(long)) {
66 ops->address(data, addr, 1);
67 frame = frame->next_frame;
68 bp = (unsigned long) frame;
69 } else {
70 ops->address(data, addr, bp == 0);
71 }
72 }
73 stack++;
74 }
75 return bp;
76}
77
78void dump_trace(struct task_struct *task, struct pt_regs *regs,
79 unsigned long *stack, unsigned long bp,
80 const struct stacktrace_ops *ops, void *data)
81{
82 if (!task)
83 task = current;
84
85 if (!stack) {
86 unsigned long dummy;
87 stack = &dummy;
88 if (task && task != current)
89 stack = (unsigned long *)task->thread.sp;
90 }
91
92#ifdef CONFIG_FRAME_POINTER
93 if (!bp) {
94 if (task == current) {
95 /* Grab bp right from our regs */
96 get_bp(bp);
97 } else {
98 /* bp is the last reg pushed by switch_to */
99 bp = *(unsigned long *) task->thread.sp;
100 }
101 }
102#endif
103
104 for (;;) {
105 struct thread_info *context;
106
107 context = (struct thread_info *)
108 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
109 bp = print_context_stack(context, stack, bp, ops, data, NULL);
110
111 stack = (unsigned long *)context->previous_esp;
112 if (!stack)
113 break;
114 if (ops->stack(data, "IRQ") < 0)
115 break;
116 touch_nmi_watchdog();
117 }
118}
119EXPORT_SYMBOL(dump_trace);
120
121static void
122print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
123{
124 printk(data);
125 print_symbol(msg, symbol);
126 printk("\n");
127}
128
129static void print_trace_warning(void *data, char *msg)
130{
131 printk("%s%s\n", (char *)data, msg);
132}
133
134static int print_trace_stack(void *data, char *name)
135{
136 printk("%s <%s> ", (char *)data, name);
137 return 0;
138}
139
140/*
141 * Print one address/symbol entry per line.
142 */
143static void print_trace_address(void *data, unsigned long addr, int reliable)
144{
145 touch_nmi_watchdog();
146 printk(data);
147 printk_address(addr, reliable);
148}
149
150static const struct stacktrace_ops print_trace_ops = {
151 .warning = print_trace_warning,
152 .warning_symbol = print_trace_warning_symbol,
153 .stack = print_trace_stack,
154 .address = print_trace_address,
155};
156
157static void
158show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
159 unsigned long *stack, unsigned long bp, char *log_lvl)
160{
161 printk("%sCall Trace:\n", log_lvl);
162 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
163}
164
165void show_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *stack, unsigned long bp)
167{
168 show_trace_log_lvl(task, regs, stack, bp, "");
169}
170
171static void
172show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *sp, unsigned long bp, char *log_lvl)
174{
175 unsigned long *stack;
176 int i;
177
178 if (sp == NULL) {
179 if (task)
180 sp = (unsigned long *)task->thread.sp;
181 else
182 sp = (unsigned long *)&sp;
183 }
184
185 stack = sp;
186 for (i = 0; i < kstack_depth_to_print; i++) {
187 if (kstack_end(stack))
188 break;
189 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
190 printk("\n%s", log_lvl);
191 printk(" %08lx", *stack++);
192 touch_nmi_watchdog();
193 }
194 printk("\n");
195 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
196}
197
198void show_stack(struct task_struct *task, unsigned long *sp)
199{
200 show_stack_log_lvl(task, NULL, sp, 0, "");
201}
202
203/*
204 * The architecture-independent dump_stack generator
205 */
206void dump_stack(void)
207{
208 unsigned long bp = 0;
209 unsigned long stack;
210
211#ifdef CONFIG_FRAME_POINTER
212 if (!bp)
213 get_bp(bp);
214#endif
215
216 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
217 current->pid, current->comm, print_tainted(),
218 init_utsname()->release,
219 (int)strcspn(init_utsname()->version, " "),
220 init_utsname()->version);
221 show_trace(NULL, NULL, &stack, bp);
222}
223
224EXPORT_SYMBOL(dump_stack);
225
226void show_registers(struct pt_regs *regs)
227{
228 int i;
229
230 print_modules();
231 __show_regs(regs, 0);
232
233 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
234 TASK_COMM_LEN, current->comm, task_pid_nr(current),
235 current_thread_info(), current, task_thread_info(current));
236 /*
237 * When in-kernel, we also print out the stack and code at the
238 * time of the fault..
239 */
240 if (!user_mode_vm(regs)) {
241 unsigned int code_prologue = code_bytes * 43 / 64;
242 unsigned int code_len = code_bytes;
243 unsigned char c;
244 u8 *ip;
245
246 printk(KERN_EMERG "Stack:\n");
247 show_stack_log_lvl(NULL, regs, &regs->sp,
248 0, KERN_EMERG);
249
250 printk(KERN_EMERG "Code: ");
251
252 ip = (u8 *)regs->ip - code_prologue;
253 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
254 /* try starting at IP */
255 ip = (u8 *)regs->ip;
256 code_len = code_len - code_prologue + 1;
257 }
258 for (i = 0; i < code_len; i++, ip++) {
259 if (ip < (u8 *)PAGE_OFFSET ||
260 probe_kernel_address(ip, c)) {
261 printk(" Bad EIP value.");
262 break;
263 }
264 if (ip == (u8 *)regs->ip)
265 printk("<%02x> ", c);
266 else
267 printk("%02x ", c);
268 }
269 }
270 printk("\n");
271}
272
273int is_valid_bugaddr(unsigned long ip)
274{
275 unsigned short ud2;
276
277 if (ip < PAGE_OFFSET)
278 return 0;
279 if (probe_kernel_address((unsigned short *)ip, ud2))
280 return 0;
281
282 return ud2 == 0x0b0f;
283}
284
285static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
286static int die_owner = -1;
287static unsigned int die_nest_count;
288
289unsigned __kprobes long oops_begin(void)
290{
291 unsigned long flags;
292
293 oops_enter();
294
295 if (die_owner != raw_smp_processor_id()) {
296 console_verbose();
297 raw_local_irq_save(flags);
298 __raw_spin_lock(&die_lock);
299 die_owner = smp_processor_id();
300 die_nest_count = 0;
301 bust_spinlocks(1);
302 } else {
303 raw_local_irq_save(flags);
304 }
305 die_nest_count++;
306 return flags;
307}
308
309void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
310{
311 bust_spinlocks(0);
312 die_owner = -1;
313 add_taint(TAINT_DIE);
314 __raw_spin_unlock(&die_lock);
315 raw_local_irq_restore(flags);
316
317 if (!regs)
318 return;
319
320 if (kexec_should_crash(current))
321 crash_kexec(regs);
322 if (in_interrupt())
323 panic("Fatal exception in interrupt");
324 if (panic_on_oops)
325 panic("Fatal exception");
326 oops_exit();
327 do_exit(signr);
328}
329
330int __kprobes __die(const char *str, struct pt_regs *regs, long err)
331{
332 unsigned short ss;
333 unsigned long sp;
334
335 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
336#ifdef CONFIG_PREEMPT
337 printk("PREEMPT ");
338#endif
339#ifdef CONFIG_SMP
340 printk("SMP ");
341#endif
342#ifdef CONFIG_DEBUG_PAGEALLOC
343 printk("DEBUG_PAGEALLOC");
344#endif
345 printk("\n");
346 if (notify_die(DIE_OOPS, str, regs, err,
347 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
348 return 1;
349
350 show_registers(regs);
351 /* Executive summary in case the oops scrolled away */
352 sp = (unsigned long) (&regs->sp);
353 savesegment(ss, ss);
354 if (user_mode(regs)) {
355 sp = regs->sp;
356 ss = regs->ss & 0xffff;
357 }
358 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
359 print_symbol("%s", regs->ip);
360 printk(" SS:ESP %04x:%08lx\n", ss, sp);
361 return 0;
362}
363
364/*
365 * This is gone through when something in the kernel has done something bad
366 * and is about to be terminated:
367 */
368void die(const char *str, struct pt_regs *regs, long err)
369{
370 unsigned long flags = oops_begin();
371
372 if (die_nest_count < 3) {
373 report_bug(regs->ip, regs);
374
375 if (__die(str, regs, err))
376 regs = NULL;
377 } else {
378 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
379 }
380
381 oops_end(flags, regs, SIGSEGV);
382}
383
384static DEFINE_SPINLOCK(nmi_print_lock);
385
386void notrace __kprobes
387die_nmi(char *str, struct pt_regs *regs, int do_panic)
388{
389 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
390 return;
391
392 spin_lock(&nmi_print_lock);
393 /*
394 * We are in trouble anyway, let's at least try
395 * to get a message out:
396 */
397 bust_spinlocks(1);
398 printk(KERN_EMERG "%s", str);
399 printk(" on CPU%d, ip %08lx, registers:\n",
400 smp_processor_id(), regs->ip);
401 show_registers(regs);
402 if (do_panic)
403 panic("Non maskable interrupt");
404 console_silent();
405 spin_unlock(&nmi_print_lock);
406 bust_spinlocks(0);
407
408 /*
409 * If we are in kernel we are probably nested up pretty bad
410 * and might as well get out now while we still can:
411 */
412 if (!user_mode_vm(regs)) {
413 current->thread.trap_no = 2;
414 crash_kexec(regs);
415 }
416
417 do_exit(SIGSEGV);
418}
419
420static int __init oops_setup(char *s)
421{
422 if (!s)
423 return -EINVAL;
424 if (!strcmp(s, "panic"))
425 panic_on_oops = 1;
426 return 0;
427}
428early_param("oops", oops_setup);
429
430static int __init kstack_setup(char *s)
431{
432 if (!s)
433 return -EINVAL;
434 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
435 return 0;
436}
437early_param("kstack", kstack_setup);
438
439static int __init code_bytes_setup(char *s)
440{
441 code_bytes = simple_strtoul(s, NULL, 0);
442 if (code_bytes > 8192)
443 code_bytes = 8192;
444
445 return 1;
446}
447__setup("code_bytes=", code_bytes_setup);
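
print_context_stack() above only trusts a stack word as a return address when it sits directly above a saved frame pointer (stack == bp + sizeof(long)), and then hops to the caller via frame->next_frame. A userspace sketch of that saved-frame-pointer chain walk; it assumes the binary keeps frame pointers (gcc -O0 or -fno-omit-frame-pointer) and deliberately stops after its own three frames rather than probing foreign stack memory:

#include <stdio.h>

/* layout of a frame-pointer-based frame, as in dumpstack_32.c */
struct stack_frame {
	struct stack_frame *next_frame;	/* saved frame pointer of the caller */
	unsigned long return_address;	/* pushed by the call instruction */
};

static void __attribute__((noinline)) show_callers(void)
{
	/* current frame pointer; the kernel reads it with get_bp() */
	struct stack_frame *frame = __builtin_frame_address(0);
	int depth;

	/* walk only frames known to belong to this program */
	for (depth = 0; frame && depth < 3; depth++) {
		printf("frame %d: return address %p\n",
		       depth, (void *)frame->return_address);
		frame = frame->next_frame;
	}
}

static void __attribute__((noinline)) middle(void) { show_callers(); }
static void __attribute__((noinline)) outer(void)  { middle(); }

int main(void)
{
	outer();
	return 0;
}
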
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..086cc8118e39
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,573 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16
17#include <asm/stacktrace.h>
18
19#define STACKSLOTS_PER_LINE 4
20#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
21
22int panic_on_unrecovered_nmi;
23int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
24static unsigned int code_bytes = 64;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
34 unsigned *usedp, char **idp)
35{
36 static char ids[][8] = {
37 [DEBUG_STACK - 1] = "#DB",
38 [NMI_STACK - 1] = "NMI",
39 [DOUBLEFAULT_STACK - 1] = "#DF",
40 [STACKFAULT_STACK - 1] = "#SS",
41 [MCE_STACK - 1] = "#MC",
42#if DEBUG_STKSZ > EXCEPTION_STKSZ
43 [N_EXCEPTION_STACKS ...
44 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
45#endif
46 };
47 unsigned k;
48
49 /*
50 * Iterate over all exception stacks, and figure out whether
51 * 'stack' is in one of them:
52 */
53 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
54 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
55 /*
56 * Is 'stack' above this exception frame's end?
57 * If yes then skip to the next frame.
58 */
59 if (stack >= end)
60 continue;
61 /*
62 * Is 'stack' above this exception frame's start address?
63 * If yes then we found the right frame.
64 */
65 if (stack >= end - EXCEPTION_STKSZ) {
66 /*
67 * Make sure we only iterate through an exception
68 * stack once. If it comes up for the second time
69 * then there's something wrong going on - just
70 * break out and return NULL:
71 */
72 if (*usedp & (1U << k))
73 break;
74 *usedp |= 1U << k;
75 *idp = ids[k];
76 return (unsigned long *)end;
77 }
78 /*
79 * If this is a debug stack, and if it has a larger size than
80 * the usual exception stacks, then 'stack' might still
81 * be within the lower portion of the debug stack:
82 */
83#if DEBUG_STKSZ > EXCEPTION_STKSZ
84 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
85 unsigned j = N_EXCEPTION_STACKS - 1;
86
87 /*
88 * Black magic. A large debug stack is composed of
89 * multiple exception stack entries, which we
90 * iterate through now. Don't look:
91 */
92 do {
93 ++j;
94 end -= EXCEPTION_STKSZ;
95 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
96 } while (stack < end - EXCEPTION_STKSZ);
97 if (*usedp & (1U << j))
98 break;
99 *usedp |= 1U << j;
100 *idp = ids[j];
101 return (unsigned long *)end;
102 }
103#endif
104 }
105 return NULL;
106}
107
108/*
109 * x86-64 can have up to three kernel stacks:
110 * process stack
111 * interrupt stack
112 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
113 */
114
115static inline int valid_stack_ptr(struct thread_info *tinfo,
116 void *p, unsigned int size, void *end)
117{
118 void *t = tinfo;
119 if (end) {
120 if (p < end && p >= (end-THREAD_SIZE))
121 return 1;
122 else
123 return 0;
124 }
125 return p > t && p < t + THREAD_SIZE - size;
126}
127
128/* The form of the top of the frame on the stack */
129struct stack_frame {
130 struct stack_frame *next_frame;
131 unsigned long return_address;
132};
133
134static inline unsigned long
135print_context_stack(struct thread_info *tinfo,
136 unsigned long *stack, unsigned long bp,
137 const struct stacktrace_ops *ops, void *data,
138 unsigned long *end)
139{
140 struct stack_frame *frame = (struct stack_frame *)bp;
141
142 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
143 unsigned long addr;
144
145 addr = *stack;
146 if (__kernel_text_address(addr)) {
147 if ((unsigned long) stack == bp + sizeof(long)) {
148 ops->address(data, addr, 1);
149 frame = frame->next_frame;
150 bp = (unsigned long) frame;
151 } else {
152 ops->address(data, addr, bp == 0);
153 }
154 }
155 stack++;
156 }
157 return bp;
158}
159
160void dump_trace(struct task_struct *task, struct pt_regs *regs,
161 unsigned long *stack, unsigned long bp,
162 const struct stacktrace_ops *ops, void *data)
163{
164 const unsigned cpu = get_cpu();
165 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
166 unsigned used = 0;
167 struct thread_info *tinfo;
168
169 if (!task)
170 task = current;
171
172 if (!stack) {
173 unsigned long dummy;
174 stack = &dummy;
175 if (task && task != current)
176 stack = (unsigned long *)task->thread.sp;
177 }
178
179#ifdef CONFIG_FRAME_POINTER
180 if (!bp) {
181 if (task == current) {
182 /* Grab bp right from our regs */
183 get_bp(bp);
184 } else {
185 /* bp is the last reg pushed by switch_to */
186 bp = *(unsigned long *) task->thread.sp;
187 }
188 }
189#endif
190
191 /*
192 * Print function call entries in all stacks, starting at the
193 * current stack address. If the stacks consist of nested
194 * exceptions, they are walked one after the other.
195 */
196 tinfo = task_thread_info(task);
197 for (;;) {
198 char *id;
199 unsigned long *estack_end;
200 estack_end = in_exception_stack(cpu, (unsigned long)stack,
201 &used, &id);
202
203 if (estack_end) {
204 if (ops->stack(data, id) < 0)
205 break;
206
207 bp = print_context_stack(tinfo, stack, bp, ops,
208 data, estack_end);
209 ops->stack(data, "<EOE>");
210 /*
211 * We link to the next stack via the
212 * second-to-last pointer (index -2 to end) in the
213 * exception stack:
214 */
215 stack = (unsigned long *) estack_end[-2];
216 continue;
217 }
218 if (irqstack_end) {
219 unsigned long *irqstack;
220 irqstack = irqstack_end -
221 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
222
223 if (stack >= irqstack && stack < irqstack_end) {
224 if (ops->stack(data, "IRQ") < 0)
225 break;
226 bp = print_context_stack(tinfo, stack, bp,
227 ops, data, irqstack_end);
228 /*
229 * We link to the next stack (which would be
230 * the process stack normally) via the last
231 * pointer (index -1 to end) in the IRQ stack:
232 */
233 stack = (unsigned long *) (irqstack_end[-1]);
234 irqstack_end = NULL;
235 ops->stack(data, "EOI");
236 continue;
237 }
238 }
239 break;
240 }
241
242 /*
243 * This handles the process stack:
244 */
245 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
246 put_cpu();
247}
248EXPORT_SYMBOL(dump_trace);
249
250static void
251print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
252{
253 printk(data);
254 print_symbol(msg, symbol);
255 printk("\n");
256}
257
258static void print_trace_warning(void *data, char *msg)
259{
260 printk("%s%s\n", (char *)data, msg);
261}
262
263static int print_trace_stack(void *data, char *name)
264{
265 printk("%s <%s> ", (char *)data, name);
266 return 0;
267}
268
269/*
270 * Print one address/symbol entry per line.
271 */
272static void print_trace_address(void *data, unsigned long addr, int reliable)
273{
274 touch_nmi_watchdog();
275 printk(data);
276 printk_address(addr, reliable);
277}
278
279static const struct stacktrace_ops print_trace_ops = {
280 .warning = print_trace_warning,
281 .warning_symbol = print_trace_warning_symbol,
282 .stack = print_trace_stack,
283 .address = print_trace_address,
284};
285
286static void
287show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
288 unsigned long *stack, unsigned long bp, char *log_lvl)
289{
290 printk("%sCall Trace:\n", log_lvl);
291 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
292}
293
294void show_trace(struct task_struct *task, struct pt_regs *regs,
295 unsigned long *stack, unsigned long bp)
296{
297 show_trace_log_lvl(task, regs, stack, bp, "");
298}
299
300static void
301show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
302 unsigned long *sp, unsigned long bp, char *log_lvl)
303{
304 unsigned long *stack;
305 int i;
306 const int cpu = smp_processor_id();
307 unsigned long *irqstack_end =
308 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
309 unsigned long *irqstack =
310 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
311
312 /*
313 * debugging aid: "show_stack(NULL, NULL);" prints the
314 * back trace for this cpu.
315 */
316
317 if (sp == NULL) {
318 if (task)
319 sp = (unsigned long *)task->thread.sp;
320 else
321 sp = (unsigned long *)&sp;
322 }
323
324 stack = sp;
325 for (i = 0; i < kstack_depth_to_print; i++) {
326 if (stack >= irqstack && stack <= irqstack_end) {
327 if (stack == irqstack_end) {
328 stack = (unsigned long *) (irqstack_end[-1]);
329 printk(" <EOI> ");
330 }
331 } else {
332 if (((long) stack & (THREAD_SIZE-1)) == 0)
333 break;
334 }
335 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
336 printk("\n%s", log_lvl);
337 printk(" %016lx", *stack++);
338 touch_nmi_watchdog();
339 }
340 printk("\n");
341 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
342}
343
344void show_stack(struct task_struct *task, unsigned long *sp)
345{
346 show_stack_log_lvl(task, NULL, sp, 0, "");
347}
348
349/*
350 * The architecture-independent dump_stack generator
351 */
352void dump_stack(void)
353{
354 unsigned long bp = 0;
355 unsigned long stack;
356
357#ifdef CONFIG_FRAME_POINTER
358 if (!bp)
359 get_bp(bp);
360#endif
361
362 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
363 current->pid, current->comm, print_tainted(),
364 init_utsname()->release,
365 (int)strcspn(init_utsname()->version, " "),
366 init_utsname()->version);
367 show_trace(NULL, NULL, &stack, bp);
368}
369EXPORT_SYMBOL(dump_stack);
370
371void show_registers(struct pt_regs *regs)
372{
373 int i;
374 unsigned long sp;
375 const int cpu = smp_processor_id();
376 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
377
378 sp = regs->sp;
379 printk("CPU %d ", cpu);
380 __show_regs(regs, 1);
381 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
382 cur->comm, cur->pid, task_thread_info(cur), cur);
383
384 /*
385 * When in-kernel, we also print out the stack and code at the
386 * time of the fault..
387 */
388 if (!user_mode(regs)) {
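		/* show about two thirds (43/64) of the code bytes before the faulting ip, the rest after it */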
389 unsigned int code_prologue = code_bytes * 43 / 64;
390 unsigned int code_len = code_bytes;
391 unsigned char c;
392 u8 *ip;
393
394 printk(KERN_EMERG "Stack:\n");
395 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
396 regs->bp, KERN_EMERG);
397
398 printk(KERN_EMERG "Code: ");
399
400 ip = (u8 *)regs->ip - code_prologue;
401 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
402 /* try starting at IP */
403 ip = (u8 *)regs->ip;
404 code_len = code_len - code_prologue + 1;
405 }
406 for (i = 0; i < code_len; i++, ip++) {
407 if (ip < (u8 *)PAGE_OFFSET ||
408 probe_kernel_address(ip, c)) {
409 printk(" Bad RIP value.");
410 break;
411 }
412 if (ip == (u8 *)regs->ip)
413 printk("<%02x> ", c);
414 else
415 printk("%02x ", c);
416 }
417 }
418 printk("\n");
419}
420
421int is_valid_bugaddr(unsigned long ip)
422{
423 unsigned short ud2;
424
425 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
426 return 0;
427
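	/* 0x0f 0x0b is the ud2 instruction; read as a little-endian u16 it is 0x0b0f */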
428 return ud2 == 0x0b0f;
429}
430
431static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
432static int die_owner = -1;
433static unsigned int die_nest_count;
434
435unsigned __kprobes long oops_begin(void)
436{
437 int cpu;
438 unsigned long flags;
439
440 oops_enter();
441
442 /* racy, but better than risking deadlock. */
443 raw_local_irq_save(flags);
444 cpu = smp_processor_id();
445 if (!__raw_spin_trylock(&die_lock)) {
446 if (cpu == die_owner)
447 /* nested oops. should stop eventually */;
448 else
449 __raw_spin_lock(&die_lock);
450 }
451 die_nest_count++;
452 die_owner = cpu;
453 console_verbose();
454 bust_spinlocks(1);
455 return flags;
456}
457
458void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
459{
460 die_owner = -1;
461 bust_spinlocks(0);
462 die_nest_count--;
463 if (!die_nest_count)
464 /* Nest count reaches zero, release the lock. */
465 __raw_spin_unlock(&die_lock);
466 raw_local_irq_restore(flags);
467 if (!regs) {
468 oops_exit();
469 return;
470 }
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473 if (panic_on_oops)
474 panic("Fatal exception");
475 oops_exit();
476 do_exit(signr);
477}
478
479int __kprobes __die(const char *str, struct pt_regs *regs, long err)
480{
481 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
482#ifdef CONFIG_PREEMPT
483 printk("PREEMPT ");
484#endif
485#ifdef CONFIG_SMP
486 printk("SMP ");
487#endif
488#ifdef CONFIG_DEBUG_PAGEALLOC
489 printk("DEBUG_PAGEALLOC");
490#endif
491 printk("\n");
492 if (notify_die(DIE_OOPS, str, regs, err,
493 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
494 return 1;
495
496 show_registers(regs);
497 add_taint(TAINT_DIE);
498 /* Executive summary in case the oops scrolled away */
499 printk(KERN_ALERT "RIP ");
500 printk_address(regs->ip, 1);
501 printk(" RSP <%016lx>\n", regs->sp);
502 if (kexec_should_crash(current))
503 crash_kexec(regs);
504 return 0;
505}
506
507void die(const char *str, struct pt_regs *regs, long err)
508{
509 unsigned long flags = oops_begin();
510
511 if (!user_mode(regs))
512 report_bug(regs->ip, regs);
513
514 if (__die(str, regs, err))
515 regs = NULL;
516 oops_end(flags, regs, SIGSEGV);
517}
518
519notrace __kprobes void
520die_nmi(char *str, struct pt_regs *regs, int do_panic)
521{
522 unsigned long flags;
523
524 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
525 return;
526
527 flags = oops_begin();
528 /*
529 * We are in trouble anyway, let's at least try
530 * to get a message out.
531 */
532 printk(KERN_EMERG "%s", str);
533 printk(" on CPU%d, ip %08lx, registers:\n",
534 smp_processor_id(), regs->ip);
535 show_registers(regs);
536 if (kexec_should_crash(current))
537 crash_kexec(regs);
538 if (do_panic || panic_on_oops)
539 panic("Non maskable interrupt");
540 oops_end(flags, NULL, SIGBUS);
541 nmi_exit();
542 local_irq_enable();
543 do_exit(SIGBUS);
544}
545
546static int __init oops_setup(char *s)
547{
548 if (!s)
549 return -EINVAL;
550 if (!strcmp(s, "panic"))
551 panic_on_oops = 1;
552 return 0;
553}
554early_param("oops", oops_setup);
555
556static int __init kstack_setup(char *s)
557{
558 if (!s)
559 return -EINVAL;
560 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
561 return 0;
562}
563early_param("kstack", kstack_setup);
564
565static int __init code_bytes_setup(char *s)
566{
567 code_bytes = simple_strtoul(s, NULL, 0);
568 if (code_bytes > 8192)
569 code_bytes = 8192;
570
571 return 1;
572}
573__setup("code_bytes=", code_bytes_setup);
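/*
 * Usage sketch (illustrative, not part of the original file): the three
 * handlers above register kernel command-line options and could be combined
 * on one boot line, e.g.:
 *
 *     oops=panic code_bytes=128 kstack=64
 *
 * "oops=panic" sets panic_on_oops, "code_bytes=" bounds the code dump in
 * show_registers() (clamped to 8192), and "kstack=" sets
 * kstack_depth_to_print for the raw stack dump.
 */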
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 000000000000..78e642feac30
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,1391 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h>
21#include <linux/suspend.h>
22#include <linux/firmware-map.h>
23
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h>
27#include <asm/proto.h>
28#include <asm/setup.h>
29#include <asm/trampoline.h>
30
31/*
32 * The e820 map is the map that gets modified e.g. with command line parameters
33 * and that is also registered with modifications in the kernel resource tree
34 * with the iomem_resource as parent.
35 *
36 * The e820_saved is directly saved after the BIOS-provided memory map is
37 * copied. It doesn't get modified afterwards. It's registered for the
38 * /sys/firmware/memmap interface.
39 *
40 * That memory map is not modified and is used as base for kexec. The kexec'd
41 * kernel should get the same memory map as the firmware provides. Then the
42 * user can e.g. boot the original kernel with mem=1G while still booting the
43 * next kernel with full memory.
44 */
45struct e820map e820;
46struct e820map e820_saved;
47
48/* For PCI or other memory-mapped resources */
49unsigned long pci_mem_start = 0xaeedbabe;
50#ifdef CONFIG_PCI
51EXPORT_SYMBOL(pci_mem_start);
52#endif
53
54/*
55 * This function checks if any part of the range <start,end> is mapped
56 * with type.
57 */
58int
59e820_any_mapped(u64 start, u64 end, unsigned type)
60{
61 int i;
62
63 for (i = 0; i < e820.nr_map; i++) {
64 struct e820entry *ei = &e820.map[i];
65
66 if (type && ei->type != type)
67 continue;
68 if (ei->addr >= end || ei->addr + ei->size <= start)
69 continue;
70 return 1;
71 }
72 return 0;
73}
74EXPORT_SYMBOL_GPL(e820_any_mapped);
75
76/*
77 * This function checks if the entire range <start,end> is mapped with type.
78 *
79 * Note: this function only works correctly if the e820 table is sorted and
80 * non-overlapping, which is the case.
81 */
82int __init e820_all_mapped(u64 start, u64 end, unsigned type)
83{
84 int i;
85
86 for (i = 0; i < e820.nr_map; i++) {
87 struct e820entry *ei = &e820.map[i];
88
89 if (type && ei->type != type)
90 continue;
91 /* does the region (at least partially) overlap the current region? */
92 if (ei->addr >= end || ei->addr + ei->size <= start)
93 continue;
94
95 /* if the region covers the beginning of <start,end>, move start to
96 * the end of the region, since the range is covered up to there
97 */
98 if (ei->addr <= start)
99 start = ei->addr + ei->size;
100 /*
101 * if start is now at or beyond end, we're done, full
102 * coverage
103 */
104 if (start >= end)
105 return 1;
106 }
107 return 0;
108}
109
110/*
111 * Add a memory region to the kernel e820 map.
112 */
113void __init e820_add_region(u64 start, u64 size, int type)
114{
115 int x = e820.nr_map;
116
117 if (x == ARRAY_SIZE(e820.map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return;
120 }
121
122 e820.map[x].addr = start;
123 e820.map[x].size = size;
124 e820.map[x].type = type;
125 e820.nr_map++;
126}
127
128void __init e820_print_map(char *who)
129{
130 int i;
131
132 for (i = 0; i < e820.nr_map; i++) {
133 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
134 (unsigned long long) e820.map[i].addr,
135 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) {
138 case E820_RAM:
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 case E820_UNUSABLE:
152 printk(KERN_CONT "(unusable)\n");
153 break;
154 default:
155 printk(KERN_CONT "type %u\n", e820.map[i].type);
156 break;
157 }
158 }
159}
160
161/*
162 * Sanitize the BIOS e820 map.
163 *
164 * Some e820 responses include overlapping entries. The following
165 * replaces the original e820 map with a new one, removing overlaps,
166 * and resolving conflicting memory types in favor of highest
167 * numbered type.
168 *
169 * The input parameter biosmap points to an array of 'struct
170 * e820entry' which on entry has elements in the range [0, *pnr_map)
171 * valid, and which has space for up to max_nr_map entries.
172 * On return, the resulting sanitized e820 map entries will be
173 * overwritten in the same location, starting at biosmap.
174 *
175 * The integer pointed to by pnr_map must be valid on entry (the
176 * current number of valid entries located at biosmap) and will
177 * be updated on return, with the new number of valid entries
178 * (which will be no more than max_nr_map).
179 *
180 * The return value from sanitize_e820_map() is zero if it
181 * successfully 'sanitized' the map entries passed in, and is -1
182 * if it did nothing, which can happen if (1) it was passed
183 * only one map entry, or (2) any of the input map entries
184 * were invalid (start + size < start, meaning that the size was
185 * so big the described memory range wrapped around through zero.)
186 *
187 * Visually we're performing the following
188 * (1,2,3,4 = memory types)...
189 *
190 * Sample memory map (w/overlaps):
191 * ____22__________________
192 * ______________________4_
193 * ____1111________________
194 * _44_____________________
195 * 11111111________________
196 * ____________________33__
197 * ___________44___________
198 * __________33333_________
199 * ______________22________
200 * ___________________2222_
201 * _________111111111______
202 * _____________________11_
203 * _________________4______
204 *
205 * Sanitized equivalent (no overlap):
206 * 1_______________________
207 * _44_____________________
208 * ___1____________________
209 * ____22__________________
210 * ______11________________
211 * _________1______________
212 * __________3_____________
213 * ___________44___________
214 * _____________33_________
215 * _______________2________
216 * ________________1_______
217 * _________________4______
218 * ___________________2____
219 * ____________________33__
220 * ______________________4_
221 */
222
223int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
224 int *pnr_map)
225{
226 struct change_member {
227 struct e820entry *pbios; /* pointer to original bios entry */
228 unsigned long long addr; /* address for this change point */
229 };
230 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
231 static struct change_member *change_point[2*E820_X_MAX] __initdata;
232 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
233 static struct e820entry new_bios[E820_X_MAX] __initdata;
234 struct change_member *change_tmp;
235 unsigned long current_type, last_type;
236 unsigned long long last_addr;
237 int chgidx, still_changing;
238 int overlap_entries;
239 int new_bios_entry;
240 int old_nr, new_nr, chg_nr;
241 int i;
242
243 /* if there's only one memory region, don't bother */
244 if (*pnr_map < 2)
245 return -1;
246
247 old_nr = *pnr_map;
248 BUG_ON(old_nr > max_nr_map);
249
250 /* bail out if we find any unreasonable addresses in bios map */
251 for (i = 0; i < old_nr; i++)
252 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
253 return -1;
254
255 /* create pointers for initial change-point information (for sorting) */
256 for (i = 0; i < 2 * old_nr; i++)
257 change_point[i] = &change_point_list[i];
258
259 /* record all known change-points (starting and ending addresses),
260 omitting those that are for empty memory regions */
261 chgidx = 0;
262 for (i = 0; i < old_nr; i++) {
263 if (biosmap[i].size != 0) {
264 change_point[chgidx]->addr = biosmap[i].addr;
265 change_point[chgidx++]->pbios = &biosmap[i];
266 change_point[chgidx]->addr = biosmap[i].addr +
267 biosmap[i].size;
268 change_point[chgidx++]->pbios = &biosmap[i];
269 }
270 }
271 chg_nr = chgidx;
272
273 /* sort change-point list by memory addresses (low -> high) */
274 still_changing = 1;
275 while (still_changing) {
276 still_changing = 0;
277 for (i = 1; i < chg_nr; i++) {
278 unsigned long long curaddr, lastaddr;
279 unsigned long long curpbaddr, lastpbaddr;
280
281 curaddr = change_point[i]->addr;
282 lastaddr = change_point[i - 1]->addr;
283 curpbaddr = change_point[i]->pbios->addr;
284 lastpbaddr = change_point[i - 1]->pbios->addr;
285
286 /*
287 * swap entries, when:
288 *
289 * curaddr > lastaddr or
290 * curaddr == lastaddr and curaddr == curpbaddr and
291 * lastaddr != lastpbaddr
292 */
293 if (curaddr < lastaddr ||
294 (curaddr == lastaddr && curaddr == curpbaddr &&
295 lastaddr != lastpbaddr)) {
296 change_tmp = change_point[i];
297 change_point[i] = change_point[i-1];
298 change_point[i-1] = change_tmp;
299 still_changing = 1;
300 }
301 }
302 }
303
304 /* create a new bios memory map, removing overlaps */
305 overlap_entries = 0; /* number of entries in the overlap table */
306 new_bios_entry = 0; /* index for creating new bios map entries */
307 last_type = 0; /* start with undefined memory type */
308 last_addr = 0; /* start with 0 as last starting address */
309
310 /* loop through change-points, determining effect on the new bios map */
311 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
312 /* keep track of all overlapping bios entries */
313 if (change_point[chgidx]->addr ==
314 change_point[chgidx]->pbios->addr) {
315 /*
316 * add map entry to overlap list (> 1 entry
317 * implies an overlap)
318 */
319 overlap_list[overlap_entries++] =
320 change_point[chgidx]->pbios;
321 } else {
322 /*
323 * remove entry from list (order independent,
324 * so swap with last)
325 */
326 for (i = 0; i < overlap_entries; i++) {
327 if (overlap_list[i] ==
328 change_point[chgidx]->pbios)
329 overlap_list[i] =
330 overlap_list[overlap_entries-1];
331 }
332 overlap_entries--;
333 }
334 /*
335 * if there are overlapping entries, decide which
336 * "type" to use (larger value takes precedence --
337 * 1=usable, 2,3,4,4+=unusable)
338 */
339 current_type = 0;
340 for (i = 0; i < overlap_entries; i++)
341 if (overlap_list[i]->type > current_type)
342 current_type = overlap_list[i]->type;
343 /*
344 * continue building up new bios map based on this
345 * information
346 */
347 if (current_type != last_type) {
348 if (last_type != 0) {
349 new_bios[new_bios_entry].size =
350 change_point[chgidx]->addr - last_addr;
351 /*
352 * move forward only if the new size
353 * was non-zero
354 */
355 if (new_bios[new_bios_entry].size != 0)
356 /*
357 * no more space left for new
358 * bios entries ?
359 */
360 if (++new_bios_entry >= max_nr_map)
361 break;
362 }
363 if (current_type != 0) {
364 new_bios[new_bios_entry].addr =
365 change_point[chgidx]->addr;
366 new_bios[new_bios_entry].type = current_type;
367 last_addr = change_point[chgidx]->addr;
368 }
369 last_type = current_type;
370 }
371 }
372 /* retain count for new bios entries */
373 new_nr = new_bios_entry;
374
375 /* copy new bios mapping into original location */
376 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
377 *pnr_map = new_nr;
378
379 return 0;
380}
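/*
 * Worked example (illustrative values): two overlapping input entries
 *
 *     0x00000000-0x0009f000  E820_RAM
 *     0x0009e000-0x000a0000  E820_RESERVED
 *
 * are rewritten as the non-overlapping
 *
 *     0x00000000-0x0009e000  E820_RAM
 *     0x0009e000-0x000a0000  E820_RESERVED
 *
 * because the higher-numbered type (reserved) wins where the two overlap.
 */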
381
382static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
383{
384 while (nr_map) {
385 u64 start = biosmap->addr;
386 u64 size = biosmap->size;
387 u64 end = start + size;
388 u32 type = biosmap->type;
389
390 /* Overflow in 64 bits? Ignore the memory map. */
391 if (start > end)
392 return -1;
393
394 e820_add_region(start, size, type);
395
396 biosmap++;
397 nr_map--;
398 }
399 return 0;
400}
401
402/*
403 * Copy the BIOS e820 map into a safe place.
404 *
405 * Sanity-check it while we're at it..
406 *
407 * If we're lucky and live on a modern system, the setup code
408 * will have given us a memory map that we can use to properly
409 * set up memory. If we aren't, we'll fake a memory map.
410 */
411static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
412{
413 /* Only one memory region (or negative)? Ignore it */
414 if (nr_map < 2)
415 return -1;
416
417 return __append_e820_map(biosmap, nr_map);
418}
419
420static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
421 u64 size, unsigned old_type,
422 unsigned new_type)
423{
424 int i;
425 u64 real_updated_size = 0;
426
427 BUG_ON(old_type == new_type);
428
429 if (size > (ULLONG_MAX - start))
430 size = ULLONG_MAX - start;
431
432 for (i = 0; i < e820x->nr_map; i++) {
433 struct e820entry *ei = &e820x->map[i];
434 u64 final_start, final_end;
435 if (ei->type != old_type)
436 continue;
437 /* totally covered? */
438 if (ei->addr >= start &&
439 (ei->addr + ei->size) <= (start + size)) {
440 ei->type = new_type;
441 real_updated_size += ei->size;
442 continue;
443 }
444 /* partially covered */
445 final_start = max(start, ei->addr);
446 final_end = min(start + size, ei->addr + ei->size);
447 if (final_start >= final_end)
448 continue;
449 e820_add_region(final_start, final_end - final_start,
450 new_type);
451 real_updated_size += final_end - final_start;
452
453 ei->size -= final_end - final_start;
454 if (ei->addr < final_start)
455 continue;
456 ei->addr = final_end;
457 }
458 return real_updated_size;
459}
460
461u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
462 unsigned new_type)
463{
464 return e820_update_range_map(&e820, start, size, old_type, new_type);
465}
466
467static u64 __init e820_update_range_saved(u64 start, u64 size,
468 unsigned old_type, unsigned new_type)
469{
470 return e820_update_range_map(&e820_saved, start, size, old_type,
471 new_type);
472}
473
474/* make e820 not cover the range */
475u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
476 int checktype)
477{
478 int i;
479 u64 real_removed_size = 0;
480
481 if (size > (ULLONG_MAX - start))
482 size = ULLONG_MAX - start;
483
484 for (i = 0; i < e820.nr_map; i++) {
485 struct e820entry *ei = &e820.map[i];
486 u64 final_start, final_end;
487
488 if (checktype && ei->type != old_type)
489 continue;
490 /* totally covered? */
491 if (ei->addr >= start &&
492 (ei->addr + ei->size) <= (start + size)) {
493 real_removed_size += ei->size;
494 memset(ei, 0, sizeof(struct e820entry));
495 continue;
496 }
497 /* partially covered */
498 final_start = max(start, ei->addr);
499 final_end = min(start + size, ei->addr + ei->size);
500 if (final_start >= final_end)
501 continue;
502 real_removed_size += final_end - final_start;
503
504 ei->size -= final_end - final_start;
505 if (ei->addr < final_start)
506 continue;
507 ei->addr = final_end;
508 }
509 return real_removed_size;
510}
511
512void __init update_e820(void)
513{
514 int nr_map;
515
516 nr_map = e820.nr_map;
517 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
518 return;
519 e820.nr_map = nr_map;
520 printk(KERN_INFO "modified physical RAM map:\n");
521 e820_print_map("modified");
522}
523static void __init update_e820_saved(void)
524{
525 int nr_map;
526
527 nr_map = e820_saved.nr_map;
528 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
529 return;
530 e820_saved.nr_map = nr_map;
531}
532#define MAX_GAP_END 0x100000000ull
533/*
534 * Search for a gap in the e820 memory space from start_addr to end_addr.
535 */
536__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
537 unsigned long start_addr, unsigned long long end_addr)
538{
539 unsigned long long last;
540 int i = e820.nr_map;
541 int found = 0;
542
543 last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
544
545 while (--i >= 0) {
546 unsigned long long start = e820.map[i].addr;
547 unsigned long long end = start + e820.map[i].size;
548
549 if (end < start_addr)
550 continue;
551
552 /*
553 * Since "last" is at most 4GB, we know we'll
554 * fit in 32 bits if this condition is true
555 */
556 if (last > end) {
557 unsigned long gap = last - end;
558
559 if (gap >= *gapsize) {
560 *gapsize = gap;
561 *gapstart = end;
562 found = 1;
563 }
564 }
565 if (start < last)
566 last = start;
567 }
568 return found;
569}
570
571/*
572 * Search for the biggest gap in the low 32 bits of the e820
573 * memory space. We pass this space to PCI to assign MMIO resources
574 * to hotplug or unconfigured devices.
575 * Hopefully the BIOS left enough space for them.
576 */
577__init void e820_setup_gap(void)
578{
579 unsigned long gapstart, gapsize, round;
580 int found;
581
582 gapstart = 0x10000000;
583 gapsize = 0x400000;
584 found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
585
586#ifdef CONFIG_X86_64
587 if (!found) {
588 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
589 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
590 "address range\n"
591 KERN_ERR "PCI: Unassigned devices with 32bit resource "
592 "registers may break!\n");
593 }
594#endif
595
596 /*
597 * See how much we want to round up: start off with
598 * rounding to the next 1MB area.
599 */
600 round = 0x100000;
601 while ((gapsize >> 4) > round)
602 round += round;
603 /* Fun with two's complement */
604 pci_mem_start = (gapstart + round) & -round;
605
606 printk(KERN_INFO
607 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
608 pci_mem_start, gapstart, gapsize);
609}
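/*
 * Worked example (illustrative values): with gapstart = 0xe0000000 and
 * gapsize = 0x20000000, the loop doubles round from 0x100000 until
 * (gapsize >> 4) <= round, giving round = 0x2000000 (32MB), so
 * pci_mem_start = (0xe0000000 + 0x2000000) & ~0x1ffffff = 0xe2000000.
 */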
610
611/**
612 * Because of the size limitation of struct boot_params, only the first
613 * 128 E820 memory entries are passed to the kernel via
614 * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
615 * of the struct setup_data linked list, which is parsed here.
616 */
617void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
618{
619 u32 map_len;
620 int entries;
621 struct e820entry *extmap;
622
623 entries = sdata->len / sizeof(struct e820entry);
624 map_len = sdata->len + sizeof(struct setup_data);
625 if (map_len > PAGE_SIZE)
626 sdata = early_ioremap(pa_data, map_len);
627 extmap = (struct e820entry *)(sdata->data);
628 __append_e820_map(extmap, entries);
629 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
630 if (map_len > PAGE_SIZE)
631 early_iounmap(sdata, map_len);
632 printk(KERN_INFO "extended physical RAM map:\n");
633 e820_print_map("extended");
634}
635
636#if defined(CONFIG_X86_64) || \
637 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
638/**
639 * Find the ranges of physical addresses that do not correspond to
640 * e820 RAM areas and mark the corresponding pages as nosave for
641 * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
642 *
643 * This function requires the e820 map to be sorted and without any
644 * overlapping entries and assumes the first e820 area to be RAM.
645 */
646void __init e820_mark_nosave_regions(unsigned long limit_pfn)
647{
648 int i;
649 unsigned long pfn;
650
651 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
652 for (i = 1; i < e820.nr_map; i++) {
653 struct e820entry *ei = &e820.map[i];
654
655 if (pfn < PFN_UP(ei->addr))
656 register_nosave_region(pfn, PFN_UP(ei->addr));
657
658 pfn = PFN_DOWN(ei->addr + ei->size);
659 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
660 register_nosave_region(PFN_UP(ei->addr), pfn);
661
662 if (pfn >= limit_pfn)
663 break;
664 }
665}
666#endif
667
668/*
669 * Early reserved memory areas.
670 */
671#define MAX_EARLY_RES 20
672
673struct early_res {
674 u64 start, end;
675 char name[16];
676 char overlap_ok;
677};
678static struct early_res early_res[MAX_EARLY_RES] __initdata = {
679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
680#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
681 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
682#endif
683#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
684 /*
685 * But first pinch a few for the stack/trampoline stuff
686 * FIXME: Don't need the extra page at 4K, but need to fix
687 * trampoline before removing it. (see the GDT stuff)
688 */
689 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
690 /*
691 * Has to be in very low memory so we can execute
692 * real-mode AP code.
693 */
694 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
695#endif
696 {}
697};
698
699static int __init find_overlapped_early(u64 start, u64 end)
700{
701 int i;
702 struct early_res *r;
703
704 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
705 r = &early_res[i];
706 if (end > r->start && start < r->end)
707 break;
708 }
709
710 return i;
711}
712
713/*
714 * Drop the i-th range from the early reservation map,
715 * by copying any higher ranges down one over it, and
716 * clearing what had been the last slot.
717 */
718static void __init drop_range(int i)
719{
720 int j;
721
722 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
723 ;
724
725 memmove(&early_res[i], &early_res[i + 1],
726 (j - 1 - i) * sizeof(struct early_res));
727
728 early_res[j - 1].end = 0;
729}
730
731/*
732 * Split any existing ranges that:
733 * 1) are marked 'overlap_ok', and
734 * 2) overlap with the stated range [start, end)
735 * into whatever portion (if any) of the existing range is entirely
736 * below or entirely above the stated range. Drop the portion
737 * of the existing range that overlaps with the stated range,
738 * which will allow the caller of this routine to then add that
739 * stated range without conflicting with any existing range.
740 */
741static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
742{
743 int i;
744 struct early_res *r;
745 u64 lower_start, lower_end;
746 u64 upper_start, upper_end;
747 char name[16];
748
749 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
750 r = &early_res[i];
751
752 /* Continue past non-overlapping ranges */
753 if (end <= r->start || start >= r->end)
754 continue;
755
756 /*
757 * Leave non-ok overlaps as is; let caller
758 * panic "Overlapping early reservations"
759 * when it hits this overlap.
760 */
761 if (!r->overlap_ok)
762 return;
763
764 /*
765 * We have an ok overlap. We will drop it from the early
766 * reservation map, and add back in any non-overlapping
767 * portions (lower or upper) as separate, overlap_ok,
768 * non-overlapping ranges.
769 */
770
771 /* 1. Note any non-overlapping (lower or upper) ranges. */
772 strncpy(name, r->name, sizeof(name) - 1);
773
774 lower_start = lower_end = 0;
775 upper_start = upper_end = 0;
776 if (r->start < start) {
777 lower_start = r->start;
778 lower_end = start;
779 }
780 if (r->end > end) {
781 upper_start = end;
782 upper_end = r->end;
783 }
784
785 /* 2. Drop the original ok overlapping range */
786 drop_range(i);
787
788 i--; /* resume for-loop on copied down entry */
789
790 /* 3. Add back in any non-overlapping ranges. */
791 if (lower_end)
792 reserve_early_overlap_ok(lower_start, lower_end, name);
793 if (upper_end)
794 reserve_early_overlap_ok(upper_start, upper_end, name);
795 }
796}
797
798static void __init __reserve_early(u64 start, u64 end, char *name,
799 int overlap_ok)
800{
801 int i;
802 struct early_res *r;
803
804 i = find_overlapped_early(start, end);
805 if (i >= MAX_EARLY_RES)
806 panic("Too many early reservations");
807 r = &early_res[i];
808 if (r->end)
809 panic("Overlapping early reservations "
810 "%llx-%llx %s to %llx-%llx %s\n",
811 start, end - 1, name?name:"", r->start,
812 r->end - 1, r->name);
813 r->start = start;
814 r->end = end;
815 r->overlap_ok = overlap_ok;
816 if (name)
817 strncpy(r->name, name, sizeof(r->name) - 1);
818}
819
820/*
821 * A few early reservations come here.
822 *
823 * The 'overlap_ok' in the name of this routine does -not- mean it
824 * is ok for these reservations to overlap an earlier reservation.
825 * Rather it means that it is ok for subsequent reservations to
826 * overlap this one.
827 *
828 * Use this entry point to reserve early ranges when you are doing
829 * so out of "Paranoia", reserving perhaps more memory than you need,
830 * just in case, and don't mind a subsequent overlapping reservation
831 * that is known to be needed.
832 *
833 * The drop_overlaps_that_are_ok() call here isn't really needed.
834 * It would be needed if we had two colliding 'overlap_ok'
835 * reservations, so that the second such would not panic on the
836 * overlap with the first. We don't have any such as of this
837 * writing, but might as well tolerate such if it happens in
838 * the future.
839 */
840void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
841{
842 drop_overlaps_that_are_ok(start, end);
843 __reserve_early(start, end, name, 1);
844}
845
846/*
847 * Most early reservations come here.
848 *
849 * We first have drop_overlaps_that_are_ok() drop any pre-existing
850 * 'overlap_ok' ranges, so that we can then reserve this memory
851 * range without risk of panic'ing on an overlapping overlap_ok
852 * early reservation.
853 */
854void __init reserve_early(u64 start, u64 end, char *name)
855{
856 drop_overlaps_that_are_ok(start, end);
857 __reserve_early(start, end, name, 0);
858}
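/*
 * Usage sketch (representative call, shown here for illustration): early
 * setup code reserves the kernel image before bootmem is up with something
 * like
 *
 *     reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
 *
 * Any later reservation overlapping a non-overlap_ok range like this one
 * panics in __reserve_early().
 */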
859
860void __init free_early(u64 start, u64 end)
861{
862 struct early_res *r;
863 int i;
864
865 i = find_overlapped_early(start, end);
866 r = &early_res[i];
867 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
868 panic("free_early on not reserved area: %llx-%llx!",
869 start, end - 1);
870
871 drop_range(i);
872}
873
874void __init early_res_to_bootmem(u64 start, u64 end)
875{
876 int i, count;
877 u64 final_start, final_end;
878
879 count = 0;
880 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
881 count++;
882
883 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
884 count, start, end);
885 for (i = 0; i < count; i++) {
886 struct early_res *r = &early_res[i];
887 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
888 r->start, r->end, r->name);
889 final_start = max(start, r->start);
890 final_end = min(end, r->end);
891 if (final_start >= final_end) {
892 printk(KERN_CONT "\n");
893 continue;
894 }
895 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
896 final_start, final_end);
897 reserve_bootmem_generic(final_start, final_end - final_start,
898 BOOTMEM_DEFAULT);
899 }
900}
901
902/* Check for already reserved areas */
903static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
904{
905 int i;
906 u64 addr = *addrp;
907 int changed = 0;
908 struct early_res *r;
909again:
910 i = find_overlapped_early(addr, addr + size);
911 r = &early_res[i];
912 if (i < MAX_EARLY_RES && r->end) {
913 *addrp = addr = round_up(r->end, align);
914 changed = 1;
915 goto again;
916 }
917 return changed;
918}
919
920/* Check for already reserved areas */
921static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
922{
923 int i;
924 u64 addr = *addrp, last;
925 u64 size = *sizep;
926 int changed = 0;
927again:
928 last = addr + size;
929 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
930 struct early_res *r = &early_res[i];
931 if (last > r->start && addr < r->start) {
932 size = r->start - addr;
933 changed = 1;
934 goto again;
935 }
936 if (last > r->end && addr < r->end) {
937 addr = round_up(r->end, align);
938 size = last - addr;
939 changed = 1;
940 goto again;
941 }
942 if (last <= r->end && addr >= r->start) {
943 (*sizep)++;
944 return 0;
945 }
946 }
947 if (changed) {
948 *addrp = addr;
949 *sizep = size;
950 }
951 return changed;
952}
953
954/*
955 * Find a free area with specified alignment in a specific range.
956 */
957u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
958{
959 int i;
960
961 for (i = 0; i < e820.nr_map; i++) {
962 struct e820entry *ei = &e820.map[i];
963 u64 addr, last;
964 u64 ei_last;
965
966 if (ei->type != E820_RAM)
967 continue;
968 addr = round_up(ei->addr, align);
969 ei_last = ei->addr + ei->size;
970 if (addr < start)
971 addr = round_up(start, align);
972 if (addr >= ei_last)
973 continue;
974 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
975 ;
976 last = addr + size;
977 if (last > ei_last)
978 continue;
979 if (last > end)
980 continue;
981 return addr;
982 }
983 return -1ULL;
984}
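/*
 * Usage sketch (hypothetical caller, names invented for illustration):
 * allocate 16 pages of RAM between 1MB and 512MB, then reserve them so
 * later early allocations skip the area:
 *
 *     u64 addr = find_e820_area(0x100000, 512ULL << 20, 16 * PAGE_SIZE, PAGE_SIZE);
 *     if (addr != -1ULL)
 *             reserve_early(addr, addr + 16 * PAGE_SIZE, "MY_TABLE");
 */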
985
986/*
987 * Find the next free range at or above start
988 */
989u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
990{
991 int i;
992
993 for (i = 0; i < e820.nr_map; i++) {
994 struct e820entry *ei = &e820.map[i];
995 u64 addr, last;
996 u64 ei_last;
997
998 if (ei->type != E820_RAM)
999 continue;
1000 addr = round_up(ei->addr, align);
1001 ei_last = ei->addr + ei->size;
1002 if (addr < start)
1003 addr = round_up(start, align);
1004 if (addr >= ei_last)
1005 continue;
1006 *sizep = ei_last - addr;
1007 while (bad_addr_size(&addr, sizep, align) &&
1008 addr + *sizep <= ei_last)
1009 ;
1010 last = addr + *sizep;
1011 if (last > ei_last)
1012 continue;
1013 return addr;
1014 }
1015 return -1UL;
1016
1017}
1018
1019/*
1020 * Pre-allocate a block (e.g. 4K) and reserve it in the e820 map
1021 */
1022u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1023{
1024 u64 size = 0;
1025 u64 addr;
1026 u64 start;
1027
1028 start = startt;
1029 while (size < sizet)
1030 start = find_e820_area_size(start, &size, align);
1031
1032 if (size < sizet)
1033 return 0;
1034
1035 addr = round_down(start + size - sizet, align);
1036 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1037 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1038 printk(KERN_INFO "update e820 for early_reserve_e820\n");
1039 update_e820();
1040 update_e820_saved();
1041
1042 return addr;
1043}
1044
1045#ifdef CONFIG_X86_32
1046# ifdef CONFIG_X86_PAE
1047# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
1048# else
1049# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
1050# endif
1051#else /* CONFIG_X86_32 */
1052# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
1053#endif
1054
1055/*
1056 * Find the highest page frame number we have available
1057 */
1058static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
1059{
1060 int i;
1061 unsigned long last_pfn = 0;
1062 unsigned long max_arch_pfn = MAX_ARCH_PFN;
1063
1064 for (i = 0; i < e820.nr_map; i++) {
1065 struct e820entry *ei = &e820.map[i];
1066 unsigned long start_pfn;
1067 unsigned long end_pfn;
1068
1069 if (ei->type != type)
1070 continue;
1071
1072 start_pfn = ei->addr >> PAGE_SHIFT;
1073 end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
1074
1075 if (start_pfn >= limit_pfn)
1076 continue;
1077 if (end_pfn > limit_pfn) {
1078 last_pfn = limit_pfn;
1079 break;
1080 }
1081 if (end_pfn > last_pfn)
1082 last_pfn = end_pfn;
1083 }
1084
1085 if (last_pfn > max_arch_pfn)
1086 last_pfn = max_arch_pfn;
1087
1088 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
1089 last_pfn, max_arch_pfn);
1090 return last_pfn;
1091}
1092unsigned long __init e820_end_of_ram_pfn(void)
1093{
1094 return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
1095}
1096
1097unsigned long __init e820_end_of_low_ram_pfn(void)
1098{
1099 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
1100}
1101/*
1102 * Finds an active region in the address range from start_pfn to last_pfn and
1103 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
1104 */
1105int __init e820_find_active_region(const struct e820entry *ei,
1106 unsigned long start_pfn,
1107 unsigned long last_pfn,
1108 unsigned long *ei_startpfn,
1109 unsigned long *ei_endpfn)
1110{
1111 u64 align = PAGE_SIZE;
1112
1113 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
1114 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
1115
1116 /* Skip map entries smaller than a page */
1117 if (*ei_startpfn >= *ei_endpfn)
1118 return 0;
1119
1120 /* Skip if map is outside the node */
1121 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1122 *ei_startpfn >= last_pfn)
1123 return 0;
1124
1125 /* Check for overlaps */
1126 if (*ei_startpfn < start_pfn)
1127 *ei_startpfn = start_pfn;
1128 if (*ei_endpfn > last_pfn)
1129 *ei_endpfn = last_pfn;
1130
1131 return 1;
1132}
1133
1134/* Walk the e820 map and register active regions within a node */
1135void __init e820_register_active_regions(int nid, unsigned long start_pfn,
1136 unsigned long last_pfn)
1137{
1138 unsigned long ei_startpfn;
1139 unsigned long ei_endpfn;
1140 int i;
1141
1142 for (i = 0; i < e820.nr_map; i++)
1143 if (e820_find_active_region(&e820.map[i],
1144 start_pfn, last_pfn,
1145 &ei_startpfn, &ei_endpfn))
1146 add_active_range(nid, ei_startpfn, ei_endpfn);
1147}
1148
1149/*
1150 * Find the hole size (in bytes) in the memory range.
1151 * @start: starting address of the memory range to scan
1152 * @end: ending address of the memory range to scan
1153 */
1154u64 __init e820_hole_size(u64 start, u64 end)
1155{
1156 unsigned long start_pfn = start >> PAGE_SHIFT;
1157 unsigned long last_pfn = end >> PAGE_SHIFT;
1158 unsigned long ei_startpfn, ei_endpfn, ram = 0;
1159 int i;
1160
1161 for (i = 0; i < e820.nr_map; i++) {
1162 if (e820_find_active_region(&e820.map[i],
1163 start_pfn, last_pfn,
1164 &ei_startpfn, &ei_endpfn))
1165 ram += ei_endpfn - ei_startpfn;
1166 }
1167 return end - start - ((u64)ram << PAGE_SHIFT);
1168}
1169
1170static void early_panic(char *msg)
1171{
1172 early_printk(msg);
1173 panic(msg);
1174}
1175
1176static int userdef __initdata;
1177
1178/* "mem=nopentium" disables the 4MB page tables. */
1179static int __init parse_memopt(char *p)
1180{
1181 u64 mem_size;
1182
1183 if (!p)
1184 return -EINVAL;
1185
1186#ifdef CONFIG_X86_32
1187 if (!strcmp(p, "nopentium")) {
1188 setup_clear_cpu_cap(X86_FEATURE_PSE);
1189 return 0;
1190 }
1191#endif
1192
1193 userdef = 1;
1194 mem_size = memparse(p, &p);
1195 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1196
1197 return 0;
1198}
1199early_param("mem", parse_memopt);
1200
1201static int __init parse_memmap_opt(char *p)
1202{
1203 char *oldp;
1204 u64 start_at, mem_size;
1205
1206 if (!p)
1207 return -EINVAL;
1208
1209 if (!strncmp(p, "exactmap", 8)) {
1210#ifdef CONFIG_CRASH_DUMP
1211 /*
1212 * If we are doing a crash dump, we still need to know
1213 * the real mem size before original memory map is
1214 * reset.
1215 */
1216 saved_max_pfn = e820_end_of_ram_pfn();
1217#endif
1218 e820.nr_map = 0;
1219 userdef = 1;
1220 return 0;
1221 }
1222
1223 oldp = p;
1224 mem_size = memparse(p, &p);
1225 if (p == oldp)
1226 return -EINVAL;
1227
1228 userdef = 1;
1229 if (*p == '@') {
1230 start_at = memparse(p+1, &p);
1231 e820_add_region(start_at, mem_size, E820_RAM);
1232 } else if (*p == '#') {
1233 start_at = memparse(p+1, &p);
1234 e820_add_region(start_at, mem_size, E820_ACPI);
1235 } else if (*p == '$') {
1236 start_at = memparse(p+1, &p);
1237 e820_add_region(start_at, mem_size, E820_RESERVED);
1238 } else
1239 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1240
1241 return *p == '\0' ? 0 : -EINVAL;
1242}
1243early_param("memmap", parse_memmap_opt);
1244
1245void __init finish_e820_parsing(void)
1246{
1247 if (userdef) {
1248 int nr = e820.nr_map;
1249
1250 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
1251 early_panic("Invalid user supplied memory map");
1252 e820.nr_map = nr;
1253
1254 printk(KERN_INFO "user-defined physical RAM map:\n");
1255 e820_print_map("user");
1256 }
1257}
1258
1259static inline const char *e820_type_to_string(int e820_type)
1260{
1261 switch (e820_type) {
1262 case E820_RESERVED_KERN:
1263 case E820_RAM: return "System RAM";
1264 case E820_ACPI: return "ACPI Tables";
1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1267 default: return "reserved";
1268 }
1269}
1270
1271/*
1272 * Mark e820 reserved areas as busy for the resource manager.
1273 */
1274static struct resource __initdata *e820_res;
1275void __init e820_reserve_resources(void)
1276{
1277 int i;
1278 struct resource *res;
1279 u64 end;
1280
1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1283 for (i = 0; i < e820.nr_map; i++) {
1284 end = e820.map[i].addr + e820.map[i].size - 1;
1285#ifndef CONFIG_RESOURCES_64BIT
1286 if (end > 0x100000000ULL) {
1287 res++;
1288 continue;
1289 }
1290#endif
1291 res->name = e820_type_to_string(e820.map[i].type);
1292 res->start = e820.map[i].addr;
1293 res->end = end;
1294
1295 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1296
1297 /*
1298 * don't register the region that could be conflicted with
1299 * pci device BAR resource and insert them later in
1300 * pcibios_resource_survey()
1301 */
1302 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1303 insert_resource(&iomem_resource, res);
1304 res++;
1305 }
1306
1307 for (i = 0; i < e820_saved.nr_map; i++) {
1308 struct e820entry *entry = &e820_saved.map[i];
1309 firmware_map_add_early(entry->addr,
1310 entry->addr + entry->size - 1,
1311 e820_type_to_string(entry->type));
1312 }
1313}
1314
1315void __init e820_reserve_resources_late(void)
1316{
1317 int i;
1318 struct resource *res;
1319
1320 res = e820_res;
1321 for (i = 0; i < e820.nr_map; i++) {
1322 if (!res->parent && res->end)
1323 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1324 res++;
1325 }
1326}
1327
1328char *__init default_machine_specific_memory_setup(void)
1329{
1330 char *who = "BIOS-e820";
1331 int new_nr;
1332 /*
1333 * Try to copy the BIOS-supplied E820-map.
1334 *
1335 * Otherwise fake a memory map; one section from 0k->640k,
1336 * the next section from 1mb->appropriate_mem_k
1337 */
1338 new_nr = boot_params.e820_entries;
1339 sanitize_e820_map(boot_params.e820_map,
1340 ARRAY_SIZE(boot_params.e820_map),
1341 &new_nr);
1342 boot_params.e820_entries = new_nr;
1343 if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
1344 < 0) {
1345 u64 mem_size;
1346
1347 /* compare results from other methods and take the greater */
1348 if (boot_params.alt_mem_k
1349 < boot_params.screen_info.ext_mem_k) {
1350 mem_size = boot_params.screen_info.ext_mem_k;
1351 who = "BIOS-88";
1352 } else {
1353 mem_size = boot_params.alt_mem_k;
1354 who = "BIOS-e801";
1355 }
1356
1357 e820.nr_map = 0;
1358 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
1359 e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
1360 }
1361
1362 /* In case someone cares... */
1363 return who;
1364}
1365
1366char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1367{
1368 if (x86_quirks->arch_memory_setup) {
1369 char *who = x86_quirks->arch_memory_setup();
1370
1371 if (who)
1372 return who;
1373 }
1374 return default_machine_specific_memory_setup();
1375}
1376
1377/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1378char * __init __attribute__((weak)) memory_setup(void)
1379{
1380 return machine_specific_memory_setup();
1381}
1382
1383void __init setup_memory_map(void)
1384{
1385 char *who;
1386
1387 who = memory_setup();
1388 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1389 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1390 e820_print_map(who);
1391}
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index ed733e7cf4e6..000000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,775 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/pfn.h>
11#include <linux/uaccess.h>
12#include <linux/suspend.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17#include <asm/setup.h>
18
19struct e820map e820;
20struct change_member {
21 struct e820entry *pbios; /* pointer to original bios entry */
22 unsigned long long addr; /* address for this change point */
23};
24static struct change_member change_point_list[2*E820MAX] __initdata;
25static struct change_member *change_point[2*E820MAX] __initdata;
26static struct e820entry *overlap_list[E820MAX] __initdata;
27static struct e820entry new_bios[E820MAX] __initdata;
28/* For PCI or other memory-mapped resources */
29unsigned long pci_mem_start = 0x10000000;
30#ifdef CONFIG_PCI
31EXPORT_SYMBOL(pci_mem_start);
32#endif
33extern int user_defined_memmap;
34
35static struct resource system_rom_resource = {
36 .name = "System ROM",
37 .start = 0xf0000,
38 .end = 0xfffff,
39 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
40};
41
42static struct resource extension_rom_resource = {
43 .name = "Extension ROM",
44 .start = 0xe0000,
45 .end = 0xeffff,
46 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
47};
48
49static struct resource adapter_rom_resources[] = { {
50 .name = "Adapter ROM",
51 .start = 0xc8000,
52 .end = 0,
53 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
54}, {
55 .name = "Adapter ROM",
56 .start = 0,
57 .end = 0,
58 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
59}, {
60 .name = "Adapter ROM",
61 .start = 0,
62 .end = 0,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64}, {
65 .name = "Adapter ROM",
66 .start = 0,
67 .end = 0,
68 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
69}, {
70 .name = "Adapter ROM",
71 .start = 0,
72 .end = 0,
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}, {
75 .name = "Adapter ROM",
76 .start = 0,
77 .end = 0,
78 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
79} };
80
81static struct resource video_rom_resource = {
82 .name = "Video ROM",
83 .start = 0xc0000,
84 .end = 0xc7fff,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86};
87
88#define ROMSIGNATURE 0xaa55
89
90static int __init romsignature(const unsigned char *rom)
91{
92 const unsigned short * const ptr = (const unsigned short *)rom;
93 unsigned short sig;
94
95 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
96}
97
98static int __init romchecksum(const unsigned char *rom, unsigned long length)
99{
100 unsigned char sum, c;
101
102 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
103 sum += c;
104 return !length && !sum;
105}
106
107static void __init probe_roms(void)
108{
109 const unsigned char *rom;
110 unsigned long start, length, upper;
111 unsigned char c;
112 int i;
113
114 /* video rom */
115 upper = adapter_rom_resources[0].start;
116 for (start = video_rom_resource.start; start < upper; start += 2048) {
117 rom = isa_bus_to_virt(start);
118 if (!romsignature(rom))
119 continue;
120
121 video_rom_resource.start = start;
122
123 if (probe_kernel_address(rom + 2, c) != 0)
124 continue;
125
126 /* 0 < length <= 0x7f * 512, historically */
127 length = c * 512;
128
129 /* if checksum okay, trust length byte */
130 if (length && romchecksum(rom, length))
131 video_rom_resource.end = start + length - 1;
132
133 request_resource(&iomem_resource, &video_rom_resource);
134 break;
135 }
136
137 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
138 if (start < upper)
139 start = upper;
140
141 /* system rom */
142 request_resource(&iomem_resource, &system_rom_resource);
143 upper = system_rom_resource.start;
144
145 /* check for extension rom (ignore length byte!) */
146 rom = isa_bus_to_virt(extension_rom_resource.start);
147 if (romsignature(rom)) {
148 length = extension_rom_resource.end - extension_rom_resource.start + 1;
149 if (romchecksum(rom, length)) {
150 request_resource(&iomem_resource, &extension_rom_resource);
151 upper = extension_rom_resource.start;
152 }
153 }
154
155 /* check for adapter roms on 2k boundaries */
156 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
157 rom = isa_bus_to_virt(start);
158 if (!romsignature(rom))
159 continue;
160
161 if (probe_kernel_address(rom + 2, c) != 0)
162 continue;
163
164 /* 0 < length <= 0x7f * 512, historically */
165 length = c * 512;
166
167 /* but accept any length that fits if checksum okay */
168 if (!length || start + length > upper || !romchecksum(rom, length))
169 continue;
170
171 adapter_rom_resources[i].start = start;
172 adapter_rom_resources[i].end = start + length - 1;
173 request_resource(&iomem_resource, &adapter_rom_resources[i]);
174
175 start = adapter_rom_resources[i++].end & ~2047UL;
176 }
177}
178
179/*
180 * Request address space for all standard RAM and ROM resources
181 * and also for regions reported as reserved by the e820.
182 */
183void __init init_iomem_resources(struct resource *code_resource,
184 struct resource *data_resource,
185 struct resource *bss_resource)
186{
187 int i;
188
189 probe_roms();
190 for (i = 0; i < e820.nr_map; i++) {
191 struct resource *res;
192#ifndef CONFIG_RESOURCES_64BIT
193 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
194 continue;
195#endif
196 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
197 switch (e820.map[i].type) {
198 case E820_RAM: res->name = "System RAM"; break;
199 case E820_ACPI: res->name = "ACPI Tables"; break;
200 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
201 default: res->name = "reserved";
202 }
203 res->start = e820.map[i].addr;
204 res->end = res->start + e820.map[i].size - 1;
205 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
206 if (request_resource(&iomem_resource, res)) {
207 kfree(res);
208 continue;
209 }
210 if (e820.map[i].type == E820_RAM) {
211 /*
212 * We don't know which RAM region contains kernel data,
213 * so we try it repeatedly and let the resource manager
214 * test it.
215 */
216 request_resource(res, code_resource);
217 request_resource(res, data_resource);
218 request_resource(res, bss_resource);
219#ifdef CONFIG_KEXEC
220 if (crashk_res.start != crashk_res.end)
221 request_resource(res, &crashk_res);
222#endif
223 }
224 }
225}
226
227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
228/**
229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
230 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
231 * hibernation.
232 *
233 * This function requires the e820 map to be sorted and without any
234 * overlapping entries and assumes the first e820 area to be RAM.
235 */
236void __init e820_mark_nosave_regions(void)
237{
238 int i;
239 unsigned long pfn;
240
241 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
242 for (i = 1; i < e820.nr_map; i++) {
243 struct e820entry *ei = &e820.map[i];
244
245 if (pfn < PFN_UP(ei->addr))
246 register_nosave_region(pfn, PFN_UP(ei->addr));
247
248 pfn = PFN_DOWN(ei->addr + ei->size);
249 if (ei->type != E820_RAM)
250 register_nosave_region(PFN_UP(ei->addr), pfn);
251
252 if (pfn >= max_low_pfn)
253 break;
254 }
255}
256#endif
257
258void __init add_memory_region(unsigned long long start,
259 unsigned long long size, int type)
260{
261 int x;
262
263 x = e820.nr_map;
264
265 if (x == E820MAX) {
266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
267 return;
268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
274} /* add_memory_region */
275
276/*
277 * Sanitize the BIOS e820 map.
278 *
279 * Some e820 responses include overlapping entries. The following
280 * replaces the original e820 map with a new one, removing overlaps.
281 *
282 */
283int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
284{
285 struct change_member *change_tmp;
286 unsigned long current_type, last_type;
287 unsigned long long last_addr;
288 int chgidx, still_changing;
289 int overlap_entries;
290 int new_bios_entry;
291 int old_nr, new_nr, chg_nr;
292 int i;
293
294 /*
295 Visually we're performing the following (1,2,3,4 = memory types)...
296
297 Sample memory map (w/overlaps):
298 ____22__________________
299 ______________________4_
300 ____1111________________
301 _44_____________________
302 11111111________________
303 ____________________33__
304 ___________44___________
305 __________33333_________
306 ______________22________
307 ___________________2222_
308 _________111111111______
309 _____________________11_
310 _________________4______
311
312 Sanitized equivalent (no overlap):
313 1_______________________
314 _44_____________________
315 ___1____________________
316 ____22__________________
317 ______11________________
318 _________1______________
319 __________3_____________
320 ___________44___________
321 _____________33_________
322 _______________2________
323 ________________1_______
324 _________________4______
325 ___________________2____
326 ____________________33__
327 ______________________4_
328 */
329 /* if there's only one memory region, don't bother */
330 if (*pnr_map < 2) {
331 return -1;
332 }
333
334 old_nr = *pnr_map;
335
336 /* bail out if we find any unreasonable addresses in bios map */
337 for (i=0; i<old_nr; i++)
338 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
339 return -1;
340 }
341
342 /* create pointers for initial change-point information (for sorting) */
343 for (i=0; i < 2*old_nr; i++)
344 change_point[i] = &change_point_list[i];
345
346 /* record all known change-points (starting and ending addresses),
347 omitting those that are for empty memory regions */
348 chgidx = 0;
349 for (i=0; i < old_nr; i++) {
350 if (biosmap[i].size != 0) {
351 change_point[chgidx]->addr = biosmap[i].addr;
352 change_point[chgidx++]->pbios = &biosmap[i];
353 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
354 change_point[chgidx++]->pbios = &biosmap[i];
355 }
356 }
357 chg_nr = chgidx; /* true number of change-points */
358
359 /* sort change-point list by memory addresses (low -> high) */
360 still_changing = 1;
361 while (still_changing) {
362 still_changing = 0;
363 for (i=1; i < chg_nr; i++) {
364 /* if <current_addr> > <last_addr>, swap */
365 /* or, if current=<start_addr> & last=<end_addr>, swap */
366 if ((change_point[i]->addr < change_point[i-1]->addr) ||
367 ((change_point[i]->addr == change_point[i-1]->addr) &&
368 (change_point[i]->addr == change_point[i]->pbios->addr) &&
369 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
370 )
371 {
372 change_tmp = change_point[i];
373 change_point[i] = change_point[i-1];
374 change_point[i-1] = change_tmp;
375 still_changing=1;
376 }
377 }
378 }
379
380 /* create a new bios memory map, removing overlaps */
381 overlap_entries=0; /* number of entries in the overlap table */
382 new_bios_entry=0; /* index for creating new bios map entries */
383 last_type = 0; /* start with undefined memory type */
384 last_addr = 0; /* start with 0 as last starting address */
385 /* loop through change-points, determining effect on the new bios map */
386 for (chgidx=0; chgidx < chg_nr; chgidx++)
387 {
388 /* keep track of all overlapping bios entries */
389 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
390 {
391 /* add map entry to overlap list (> 1 entry implies an overlap) */
392 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
393 }
394 else
395 {
396 /* remove entry from list (order independent, so swap with last) */
397 for (i=0; i<overlap_entries; i++)
398 {
399 if (overlap_list[i] == change_point[chgidx]->pbios)
400 overlap_list[i] = overlap_list[overlap_entries-1];
401 }
402 overlap_entries--;
403 }
404 /* if there are overlapping entries, decide which "type" to use */
405 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
406 current_type = 0;
407 for (i=0; i<overlap_entries; i++)
408 if (overlap_list[i]->type > current_type)
409 current_type = overlap_list[i]->type;
410 /* continue building up new bios map based on this information */
411 if (current_type != last_type) {
412 if (last_type != 0) {
413 new_bios[new_bios_entry].size =
414 change_point[chgidx]->addr - last_addr;
415 /* move forward only if the new size was non-zero */
416 if (new_bios[new_bios_entry].size != 0)
417 if (++new_bios_entry >= E820MAX)
418 break; /* no more space left for new bios entries */
419 }
420 if (current_type != 0) {
421 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
422 new_bios[new_bios_entry].type = current_type;
423 last_addr=change_point[chgidx]->addr;
424 }
425 last_type = current_type;
426 }
427 }
428 new_nr = new_bios_entry; /* retain count for new bios entries */
429
430 /* copy new bios mapping into original location */
431 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
432 *pnr_map = new_nr;
433
434 return 0;
435}
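
/*
 * A small worked example of the sanitizer above, using made-up entries
 * (not taken from any real BIOS):
 *
 *   input:   0x00000000 - 0x0000a000 (usable)     overlaps the next entry
 *            0x00009000 - 0x0000a000 (reserved)
 *
 *   output:  0x00000000 - 0x00009000 (usable)
 *            0x00009000 - 0x0000a000 (reserved)
 *
 * Wherever regions overlap, the entry with the larger type value wins,
 * exactly as the change-point walk above implements it.
 */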
436
437/*
438 * Copy the BIOS e820 map into a safe place.
439 *
440 * Sanity-check it while we're at it..
441 *
442 * If we're lucky and live on a modern system, the setup code
443 * will have given us a memory map that we can use to properly
444 * set up memory. If we aren't, we'll fake a memory map.
445 *
446 * We check to see that the memory map contains at least 2 elements
447 * before we'll use it, because the detection code in setup.S may
448 * not be perfect and most every PC known to man has two memory
449 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
450 * thinkpad 560x, for example, does not cooperate with the memory
451 * detection code.)
452 */
453int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
454{
455 /* Only one memory region (or negative)? Ignore it */
456 if (nr_map < 2)
457 return -1;
458
459 do {
460 u64 start = biosmap->addr;
461 u64 size = biosmap->size;
462 u64 end = start + size;
463 u32 type = biosmap->type;
464
465 /* Overflow in 64 bits? Ignore the memory map. */
466 if (start > end)
467 return -1;
468
469 add_memory_region(start, size, type);
470 } while (biosmap++, --nr_map);
471
472 return 0;
473}
474
475/*
476 * Find the highest page frame number we have available
477 */
478void __init propagate_e820_map(void)
479{
480 int i;
481
482 max_pfn = 0;
483
484 for (i = 0; i < e820.nr_map; i++) {
485 unsigned long start, end;
486 /* RAM? */
487 if (e820.map[i].type != E820_RAM)
488 continue;
489 start = PFN_UP(e820.map[i].addr);
490 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
491 if (start >= end)
492 continue;
493 if (end > max_pfn)
494 max_pfn = end;
495 memory_present(0, start, end);
496 }
497}
498
499/*
500 * Register fully available low RAM pages with the bootmem allocator.
501 */
502void __init register_bootmem_low_pages(unsigned long max_low_pfn)
503{
504 int i;
505
506 for (i = 0; i < e820.nr_map; i++) {
507 unsigned long curr_pfn, last_pfn, size;
508 /*
509 * Reserve usable low memory
510 */
511 if (e820.map[i].type != E820_RAM)
512 continue;
513 /*
514 * We are rounding up the start address of usable memory:
515 */
516 curr_pfn = PFN_UP(e820.map[i].addr);
517 if (curr_pfn >= max_low_pfn)
518 continue;
519 /*
520 * ... and at the end of the usable range downwards:
521 */
522 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
523
524 if (last_pfn > max_low_pfn)
525 last_pfn = max_low_pfn;
526
527 /*
528 * .. finally, did all the rounding and playing
529 * around just make the area go away?
530 */
531 if (last_pfn <= curr_pfn)
532 continue;
533
534 size = last_pfn - curr_pfn;
535 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
536 }
537}
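
/*
 * Rounding sketch with hypothetical values: for an e820 entry spanning
 * 0x01000200 - 0x09fffc00, PFN_UP() rounds the start up to pfn 0x1001 and
 * PFN_DOWN() rounds the end down to pfn 0x9fff, so only pages that lie
 * entirely inside the entry are handed to free_bootmem().
 */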
538
539void __init e820_register_memory(void)
540{
541 unsigned long gapstart, gapsize, round;
542 unsigned long long last;
543 int i;
544
545 /*
546 * Search for the biggest gap in the low 32 bits of the e820
547 * memory space.
548 */
549 last = 0x100000000ull;
550 gapstart = 0x10000000;
551 gapsize = 0x400000;
552 i = e820.nr_map;
553 while (--i >= 0) {
554 unsigned long long start = e820.map[i].addr;
555 unsigned long long end = start + e820.map[i].size;
556
557 /*
558 * Since "last" is at most 4GB, we know we'll
559 * fit in 32 bits if this condition is true
560 */
561 if (last > end) {
562 unsigned long gap = last - end;
563
564 if (gap > gapsize) {
565 gapsize = gap;
566 gapstart = end;
567 }
568 }
569 if (start < last)
570 last = start;
571 }
572
573 /*
574 * See how much we want to round up: start off with
575 * rounding to the next 1MB area.
576 */
577 round = 0x100000;
578 while ((gapsize >> 4) > round)
579 round += round;
580 /* Fun with two's complement */
581 pci_mem_start = (gapstart + round) & -round;
582
583 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
584 pci_mem_start, gapstart, gapsize);
585}
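
/*
 * Rounding sketch with made-up numbers: for gapstart = 0xdff00000 and
 * gapsize = 0x20000000, the loop above doubles "round" from 0x100000 until
 * it is no longer smaller than gapsize >> 4 = 0x2000000, so round ends up
 * as 0x2000000.  (gapstart + round) & -round then gives 0xe0000000, an
 * aligned base for the PCI MMIO window.
 */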
586
587void __init print_memory_map(char *who)
588{
589 int i;
590
591 for (i = 0; i < e820.nr_map; i++) {
592 printk(" %s: %016Lx - %016Lx ", who,
593 e820.map[i].addr,
594 e820.map[i].addr + e820.map[i].size);
595 switch (e820.map[i].type) {
596 case E820_RAM: printk("(usable)\n");
597 break;
598 case E820_RESERVED:
599 printk("(reserved)\n");
600 break;
601 case E820_ACPI:
602 printk("(ACPI data)\n");
603 break;
604 case E820_NVS:
605 printk("(ACPI NVS)\n");
606 break;
607 default: printk("type %u\n", e820.map[i].type);
608 break;
609 }
610 }
611}
612
613void __init limit_regions(unsigned long long size)
614{
615 unsigned long long current_addr;
616 int i;
617
618 print_memory_map("limit_regions start");
619 for (i = 0; i < e820.nr_map; i++) {
620 current_addr = e820.map[i].addr + e820.map[i].size;
621 if (current_addr < size)
622 continue;
623
624 if (e820.map[i].type != E820_RAM)
625 continue;
626
627 if (e820.map[i].addr >= size) {
628 /*
629 * This region starts past the end of the
630 * requested size, skip it completely.
631 */
632 e820.nr_map = i;
633 } else {
634 e820.nr_map = i + 1;
635 e820.map[i].size -= current_addr - size;
636 }
637 print_memory_map("limit_regions endfor");
638 return;
639 }
640 print_memory_map("limit_regions endfunc");
641}
642
643/*
644 * This function checks if any part of the range <start,end> is mapped
645 * with type.
646 */
647int
648e820_any_mapped(u64 start, u64 end, unsigned type)
649{
650 int i;
651 for (i = 0; i < e820.nr_map; i++) {
652 const struct e820entry *ei = &e820.map[i];
653 if (type && ei->type != type)
654 continue;
655 if (ei->addr >= end || ei->addr + ei->size <= start)
656 continue;
657 return 1;
658 }
659 return 0;
660}
661EXPORT_SYMBOL_GPL(e820_any_mapped);
662
663 /*
664 * This function checks if the entire range <start,end> is mapped with type.
665 *
666 * Note: this function only works correctly if the e820 table is sorted and
667 * non-overlapping, which is the case
668 */
669int __init
670e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
671{
672 u64 start = s;
673 u64 end = e;
674 int i;
675 for (i = 0; i < e820.nr_map; i++) {
676 struct e820entry *ei = &e820.map[i];
677 if (type && ei->type != type)
678 continue;
679 /* does the region overlap <start,end> at least partially? */
680 if (ei->addr >= end || ei->addr + ei->size <= start)
681 continue;
682 /* if the region covers the beginning of <start,end>, move
683 * start past its end, since coverage holds up to that point
684 */
685 if (ei->addr <= start)
686 start = ei->addr + ei->size;
687 /* if start is now at or beyond end, we're done, full
688 * coverage */
689 if (start >= end)
690 return 1; /* we're done */
691 }
692 return 0;
693}
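
/*
 * A minimal, purely illustrative caller of the two range helpers above
 * (the function name is hypothetical, not a kernel API): check that a
 * candidate range is entirely usable RAM and not also claimed as reserved.
 */
static int __init example_range_is_ram(unsigned long start, unsigned long size)
{
	/* every byte of the range must be E820_RAM */
	if (!e820_all_mapped(start, start + size, E820_RAM))
		return 0;
	/* and no part of it may also be marked reserved */
	if (e820_any_mapped(start, start + size, E820_RESERVED))
		return 0;
	return 1;
}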
694
695static int __init parse_memmap(char *arg)
696{
697 if (!arg)
698 return -EINVAL;
699
700 if (strcmp(arg, "exactmap") == 0) {
701#ifdef CONFIG_CRASH_DUMP
702 /* If we are doing a crash dump, we
703 * still need to know the real mem
704 * size before the original memory
705 * map is reset.
706 */
707 propagate_e820_map();
708 saved_max_pfn = max_pfn;
709#endif
710 e820.nr_map = 0;
711 user_defined_memmap = 1;
712 } else {
713 /* If the user specifies memory size, we
714 * limit the BIOS-provided memory map to
715 * that size. exactmap can be used to specify
716 * the exact map. mem=number can be used to
717 * trim the existing memory map.
718 */
719 unsigned long long start_at, mem_size;
720
721 mem_size = memparse(arg, &arg);
722 if (*arg == '@') {
723 start_at = memparse(arg+1, &arg);
724 add_memory_region(start_at, mem_size, E820_RAM);
725 } else if (*arg == '#') {
726 start_at = memparse(arg+1, &arg);
727 add_memory_region(start_at, mem_size, E820_ACPI);
728 } else if (*arg == '$') {
729 start_at = memparse(arg+1, &arg);
730 add_memory_region(start_at, mem_size, E820_RESERVED);
731 } else {
732 limit_regions(mem_size);
733 user_defined_memmap = 1;
734 }
735 }
736 return 0;
737}
738early_param("memmap", parse_memmap);
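
/*
 * Illustrative command lines handled by parse_memmap() above (all values
 * are hypothetical):
 *
 *   memmap=exactmap memmap=640K@0 memmap=1023M@1M  - use only these regions
 *   memmap=64M@16M                                 - add RAM at 16M
 *   memmap=16M#1G                                  - mark ACPI data at 1G
 *   memmap=16M$1G                                  - reserve 16M at 1G
 *   memmap=512M                                    - trim the map to 512M
 */
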
739void __init update_memory_range(u64 start, u64 size, unsigned old_type,
740 unsigned new_type)
741{
742 int i;
743
744 BUG_ON(old_type == new_type);
745
746 for (i = 0; i < e820.nr_map; i++) {
747 struct e820entry *ei = &e820.map[i];
748 u64 final_start, final_end;
749 if (ei->type != old_type)
750 continue;
751 /* totally covered? */
752 if (ei->addr >= start && ei->size <= size) {
753 ei->type = new_type;
754 continue;
755 }
756 /* partially covered */
757 final_start = max(start, ei->addr);
758 final_end = min(start + size, ei->addr + ei->size);
759 if (final_start >= final_end)
760 continue;
761 add_memory_region(final_start, final_end - final_start,
762 new_type);
763 }
764}
765void __init update_e820(void)
766{
767 u8 nr_map;
768
769 nr_map = e820.nr_map;
770 if (sanitize_e820_map(e820.map, &nr_map))
771 return;
772 e820.nr_map = nr_map;
773 printk(KERN_INFO "modified physical RAM map:\n");
774 print_memory_map("modified");
775}
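
/*
 * Hypothetical use of the two helpers above: retype a range that firmware
 * reported as usable RAM but that must be kept away from the allocator,
 * then re-sanitize and reprint the map.
 *
 *	update_memory_range(addr, size, E820_RAM, E820_RESERVED);
 *	update_e820();
 */
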
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
deleted file mode 100644
index 124480c0008d..000000000000
--- a/arch/x86/kernel/e820_64.c
+++ /dev/null
@@ -1,952 +0,0 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/setup.h>
28#include <asm/sections.h>
29#include <asm/kdebug.h>
30#include <asm/trampoline.h>
31
32struct e820map e820;
33
34/*
35 * PFN of last memory page.
36 */
37unsigned long end_pfn;
38
39/*
40 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
41 * The direct mapping extends to max_pfn_mapped, so that we can directly access
42 * apertures, ACPI and other tables without having to play with fixmaps.
43 */
44unsigned long max_pfn_mapped;
45
46/*
47 * Last pfn which the user wants to use.
48 */
49static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
50
51/*
52 * Early reserved memory areas.
53 */
54#define MAX_EARLY_RES 20
55
56struct early_res {
57 unsigned long start, end;
58 char name[16];
59};
60static struct early_res early_res[MAX_EARLY_RES] __initdata = {
61 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
62#ifdef CONFIG_X86_TRAMPOLINE
63 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
64#endif
65 {}
66};
67
68void __init reserve_early(unsigned long start, unsigned long end, char *name)
69{
70 int i;
71 struct early_res *r;
72 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
73 r = &early_res[i];
74 if (end > r->start && start < r->end)
75 panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
76 start, end - 1, name?name:"", r->start, r->end - 1, r->name);
77 }
78 if (i >= MAX_EARLY_RES)
79 panic("Too many early reservations");
80 r = &early_res[i];
81 r->start = start;
82 r->end = end;
83 if (name)
84 strncpy(r->name, name, sizeof(r->name) - 1);
85}
86
87void __init free_early(unsigned long start, unsigned long end)
88{
89 struct early_res *r;
90 int i, j;
91
92 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
93 r = &early_res[i];
94 if (start == r->start && end == r->end)
95 break;
96 }
97 if (i >= MAX_EARLY_RES || !early_res[i].end)
98 panic("free_early on not reserved area: %lx-%lx!", start, end);
99
100 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
101 ;
102
103 memmove(&early_res[i], &early_res[i + 1],
104 (j - 1 - i) * sizeof(struct early_res));
105
106 early_res[j - 1].end = 0;
107}
108
109void __init early_res_to_bootmem(unsigned long start, unsigned long end)
110{
111 int i;
112 unsigned long final_start, final_end;
113 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
114 struct early_res *r = &early_res[i];
115 final_start = max(start, r->start);
116 final_end = min(end, r->end);
117 if (final_start >= final_end)
118 continue;
119 printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
120 final_start, final_end - 1, r->name);
121 reserve_bootmem_generic(final_start, final_end - final_start);
122 }
123}
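
/*
 * Typical (illustrative) early_res usage: reserve a range long before the
 * bootmem allocator exists, then convert all pending reservations once
 * bootmem is up.  The symbols below are only an example.
 *
 *	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
 *	...
 *	early_res_to_bootmem(0, end_pfn << PAGE_SHIFT);
 */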
124
125/* Check for already reserved areas */
126static inline int __init
127bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
128{
129 int i;
130 unsigned long addr = *addrp, last;
131 int changed = 0;
132again:
133 last = addr + size;
134 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
135 struct early_res *r = &early_res[i];
136 if (last >= r->start && addr < r->end) {
137 *addrp = addr = round_up(r->end, align);
138 changed = 1;
139 goto again;
140 }
141 }
142 return changed;
143}
144
145/* Check for already reserved areas */
146static inline int __init
147bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
148{
149 int i;
150 unsigned long addr = *addrp, last;
151 unsigned long size = *sizep;
152 int changed = 0;
153again:
154 last = addr + size;
155 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
156 struct early_res *r = &early_res[i];
157 if (last > r->start && addr < r->start) {
158 size = r->start - addr;
159 changed = 1;
160 goto again;
161 }
162 if (last > r->end && addr < r->end) {
163 addr = round_up(r->end, align);
164 size = last - addr;
165 changed = 1;
166 goto again;
167 }
168 if (last <= r->end && addr >= r->start) {
169 (*sizep)++;
170 return 0;
171 }
172 }
173 if (changed) {
174 *addrp = addr;
175 *sizep = size;
176 }
177 return changed;
178}
179/*
180 * This function checks if any part of the range <start,end> is mapped
181 * with type.
182 */
183int
184e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
185{
186 int i;
187
188 for (i = 0; i < e820.nr_map; i++) {
189 struct e820entry *ei = &e820.map[i];
190
191 if (type && ei->type != type)
192 continue;
193 if (ei->addr >= end || ei->addr + ei->size <= start)
194 continue;
195 return 1;
196 }
197 return 0;
198}
199EXPORT_SYMBOL_GPL(e820_any_mapped);
200
201/*
202 * This function checks if the entire range <start,end> is mapped with type.
203 *
204 * Note: this function only works correctly if the e820 table is sorted and
205 * non-overlapping, which is the case
206 */
207int __init e820_all_mapped(unsigned long start, unsigned long end,
208 unsigned type)
209{
210 int i;
211
212 for (i = 0; i < e820.nr_map; i++) {
213 struct e820entry *ei = &e820.map[i];
214
215 if (type && ei->type != type)
216 continue;
217 /* does the region overlap <start,end> at least partially? */
218 if (ei->addr >= end || ei->addr + ei->size <= start)
219 continue;
220
221 /* if the region covers the beginning of <start,end>, move
222 * start past its end, since coverage holds up to that point
223 */
224 if (ei->addr <= start)
225 start = ei->addr + ei->size;
226 /*
227 * if start is now at or beyond end, we're done, full
228 * coverage
229 */
230 if (start >= end)
231 return 1;
232 }
233 return 0;
234}
235
236/*
237 * Find a free area with specified alignment in a specific range.
238 */
239unsigned long __init find_e820_area(unsigned long start, unsigned long end,
240 unsigned long size, unsigned long align)
241{
242 int i;
243
244 for (i = 0; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246 unsigned long addr, last;
247 unsigned long ei_last;
248
249 if (ei->type != E820_RAM)
250 continue;
251 addr = round_up(ei->addr, align);
252 ei_last = ei->addr + ei->size;
253 if (addr < start)
254 addr = round_up(start, align);
255 if (addr >= ei_last)
256 continue;
257 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
258 ;
259 last = addr + size;
260 if (last > ei_last)
261 continue;
262 if (last > end)
263 continue;
264 return addr;
265 }
266 return -1UL;
267}
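
/*
 * Illustrative allocation pattern built on find_e820_area() (the size and
 * label below are made up): grab page-aligned memory from the e820 RAM
 * ranges and pin it with an early reservation so nothing else claims it.
 *
 *	addr = find_e820_area(0, end_pfn << PAGE_SHIFT, SZ, PAGE_SIZE);
 *	if (addr == -1UL)
 *		panic("cannot allocate SZ bytes");
 *	reserve_early(addr, addr + SZ, "example");
 */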
268
269/*
270 * Find next free range after *start
271 */
272unsigned long __init find_e820_area_size(unsigned long start,
273 unsigned long *sizep,
274 unsigned long align)
275{
276 int i;
277
278 for (i = 0; i < e820.nr_map; i++) {
279 struct e820entry *ei = &e820.map[i];
280 unsigned long addr, last;
281 unsigned long ei_last;
282
283 if (ei->type != E820_RAM)
284 continue;
285 addr = round_up(ei->addr, align);
286 ei_last = ei->addr + ei->size;
287 if (addr < start)
288 addr = round_up(start, align);
289 if (addr >= ei_last)
290 continue;
291 *sizep = ei_last - addr;
292 while (bad_addr_size(&addr, sizep, align) &&
293 addr + *sizep <= ei_last)
294 ;
295 last = addr + *sizep;
296 if (last > ei_last)
297 continue;
298 return addr;
299 }
300 return -1UL;
301
302}
303/*
304 * Find the highest page frame number we have available
305 */
306unsigned long __init e820_end_of_ram(void)
307{
308 unsigned long end_pfn;
309
310 end_pfn = find_max_pfn_with_active_regions();
311
312 if (end_pfn > max_pfn_mapped)
313 max_pfn_mapped = end_pfn;
314 if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
315 max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
316 if (end_pfn > end_user_pfn)
317 end_pfn = end_user_pfn;
318 if (end_pfn > max_pfn_mapped)
319 end_pfn = max_pfn_mapped;
320
321 printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
322 return end_pfn;
323}
324
325/*
326 * Mark e820 reserved areas as busy for the resource manager.
327 */
328void __init e820_reserve_resources(void)
329{
330 int i;
331 struct resource *res;
332
333 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
334 for (i = 0; i < e820.nr_map; i++) {
335 switch (e820.map[i].type) {
336 case E820_RAM: res->name = "System RAM"; break;
337 case E820_ACPI: res->name = "ACPI Tables"; break;
338 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
339 default: res->name = "reserved";
340 }
341 res->start = e820.map[i].addr;
342 res->end = res->start + e820.map[i].size - 1;
343 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
344 insert_resource(&iomem_resource, res);
345 res++;
346 }
347}
348
349/*
350 * Find the ranges of physical addresses that do not correspond to
351 * e820 RAM areas and mark the corresponding pages as nosave for software
352 * suspend and suspend to RAM.
353 *
354 * This function requires the e820 map to be sorted and without any
355 * overlapping entries and assumes the first e820 area to be RAM.
356 */
357void __init e820_mark_nosave_regions(void)
358{
359 int i;
360 unsigned long paddr;
361
362 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
363 for (i = 1; i < e820.nr_map; i++) {
364 struct e820entry *ei = &e820.map[i];
365
366 if (paddr < ei->addr)
367 register_nosave_region(PFN_DOWN(paddr),
368 PFN_UP(ei->addr));
369
370 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
371 if (ei->type != E820_RAM)
372 register_nosave_region(PFN_UP(ei->addr),
373 PFN_DOWN(paddr));
374
375 if (paddr >= (end_pfn << PAGE_SHIFT))
376 break;
377 }
378}
379
380/*
381 * Finds an active region in the address range from start_pfn to end_pfn and
382 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
383 */
384static int __init e820_find_active_region(const struct e820entry *ei,
385 unsigned long start_pfn,
386 unsigned long end_pfn,
387 unsigned long *ei_startpfn,
388 unsigned long *ei_endpfn)
389{
390 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
391 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
392
393 /* Skip map entries smaller than a page */
394 if (*ei_startpfn >= *ei_endpfn)
395 return 0;
396
397 /* Check if max_pfn_mapped should be updated */
398 if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
399 max_pfn_mapped = *ei_endpfn;
400
401 /* Skip if map is outside the node */
402 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
403 *ei_startpfn >= end_pfn)
404 return 0;
405
406 /* Check for overlaps */
407 if (*ei_startpfn < start_pfn)
408 *ei_startpfn = start_pfn;
409 if (*ei_endpfn > end_pfn)
410 *ei_endpfn = end_pfn;
411
412 /* Obey end_user_pfn to save on memmap */
413 if (*ei_startpfn >= end_user_pfn)
414 return 0;
415 if (*ei_endpfn > end_user_pfn)
416 *ei_endpfn = end_user_pfn;
417
418 return 1;
419}
420
421/* Walk the e820 map and register active regions within a node */
422void __init
423e820_register_active_regions(int nid, unsigned long start_pfn,
424 unsigned long end_pfn)
425{
426 unsigned long ei_startpfn;
427 unsigned long ei_endpfn;
428 int i;
429
430 for (i = 0; i < e820.nr_map; i++)
431 if (e820_find_active_region(&e820.map[i],
432 start_pfn, end_pfn,
433 &ei_startpfn, &ei_endpfn))
434 add_active_range(nid, ei_startpfn, ei_endpfn);
435}
436
437/*
438 * Add a memory region to the kernel e820 map.
439 */
440void __init add_memory_region(unsigned long start, unsigned long size, int type)
441{
442 int x = e820.nr_map;
443
444 if (x == E820MAX) {
445 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
446 return;
447 }
448
449 e820.map[x].addr = start;
450 e820.map[x].size = size;
451 e820.map[x].type = type;
452 e820.nr_map++;
453}
454
455/*
456 * Find the hole size (in bytes) in the memory range.
457 * @start: starting address of the memory range to scan
458 * @end: ending address of the memory range to scan
459 */
460unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
461{
462 unsigned long start_pfn = start >> PAGE_SHIFT;
463 unsigned long end_pfn = end >> PAGE_SHIFT;
464 unsigned long ei_startpfn, ei_endpfn, ram = 0;
465 int i;
466
467 for (i = 0; i < e820.nr_map; i++) {
468 if (e820_find_active_region(&e820.map[i],
469 start_pfn, end_pfn,
470 &ei_startpfn, &ei_endpfn))
471 ram += ei_endpfn - ei_startpfn;
472 }
473 return end - start - (ram << PAGE_SHIFT);
474}
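
/*
 * Worked example with a hypothetical map containing RAM at 0 - 640K and
 * RAM again from 1M upward: e820_hole_size(0, 0x100000) counts 0xa0 RAM
 * pages below 1M and therefore reports a hole of 1M - 640K = 384K, i.e.
 * the legacy ISA/VGA window.
 */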
475
476static void __init e820_print_map(char *who)
477{
478 int i;
479
480 for (i = 0; i < e820.nr_map; i++) {
481 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
482 (unsigned long long) e820.map[i].addr,
483 (unsigned long long)
484 (e820.map[i].addr + e820.map[i].size));
485 switch (e820.map[i].type) {
486 case E820_RAM:
487 printk(KERN_CONT "(usable)\n");
488 break;
489 case E820_RESERVED:
490 printk(KERN_CONT "(reserved)\n");
491 break;
492 case E820_ACPI:
493 printk(KERN_CONT "(ACPI data)\n");
494 break;
495 case E820_NVS:
496 printk(KERN_CONT "(ACPI NVS)\n");
497 break;
498 default:
499 printk(KERN_CONT "type %u\n", e820.map[i].type);
500 break;
501 }
502 }
503}
504
505/*
506 * Sanitize the BIOS e820 map.
507 *
508 * Some e820 responses include overlapping entries. The following
509 * replaces the original e820 map with a new one, removing overlaps.
510 *
511 */
512static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
513{
514 struct change_member {
515 struct e820entry *pbios; /* pointer to original bios entry */
516 unsigned long long addr; /* address for this change point */
517 };
518 static struct change_member change_point_list[2*E820MAX] __initdata;
519 static struct change_member *change_point[2*E820MAX] __initdata;
520 static struct e820entry *overlap_list[E820MAX] __initdata;
521 static struct e820entry new_bios[E820MAX] __initdata;
522 struct change_member *change_tmp;
523 unsigned long current_type, last_type;
524 unsigned long long last_addr;
525 int chgidx, still_changing;
526 int overlap_entries;
527 int new_bios_entry;
528 int old_nr, new_nr, chg_nr;
529 int i;
530
531 /*
532 Visually we're performing the following
533 (1,2,3,4 = memory types)...
534
535 Sample memory map (w/overlaps):
536 ____22__________________
537 ______________________4_
538 ____1111________________
539 _44_____________________
540 11111111________________
541 ____________________33__
542 ___________44___________
543 __________33333_________
544 ______________22________
545 ___________________2222_
546 _________111111111______
547 _____________________11_
548 _________________4______
549
550 Sanitized equivalent (no overlap):
551 1_______________________
552 _44_____________________
553 ___1____________________
554 ____22__________________
555 ______11________________
556 _________1______________
557 __________3_____________
558 ___________44___________
559 _____________33_________
560 _______________2________
561 ________________1_______
562 _________________4______
563 ___________________2____
564 ____________________33__
565 ______________________4_
566 */
567
568 /* if there's only one memory region, don't bother */
569 if (*pnr_map < 2)
570 return -1;
571
572 old_nr = *pnr_map;
573
574 /* bail out if we find any unreasonable addresses in bios map */
575 for (i = 0; i < old_nr; i++)
576 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
577 return -1;
578
579 /* create pointers for initial change-point information (for sorting) */
580 for (i = 0; i < 2 * old_nr; i++)
581 change_point[i] = &change_point_list[i];
582
583 /* record all known change-points (starting and ending addresses),
584 omitting those that are for empty memory regions */
585 chgidx = 0;
586 for (i = 0; i < old_nr; i++) {
587 if (biosmap[i].size != 0) {
588 change_point[chgidx]->addr = biosmap[i].addr;
589 change_point[chgidx++]->pbios = &biosmap[i];
590 change_point[chgidx]->addr = biosmap[i].addr +
591 biosmap[i].size;
592 change_point[chgidx++]->pbios = &biosmap[i];
593 }
594 }
595 chg_nr = chgidx;
596
597 /* sort change-point list by memory addresses (low -> high) */
598 still_changing = 1;
599 while (still_changing) {
600 still_changing = 0;
601 for (i = 1; i < chg_nr; i++) {
602 unsigned long long curaddr, lastaddr;
603 unsigned long long curpbaddr, lastpbaddr;
604
605 curaddr = change_point[i]->addr;
606 lastaddr = change_point[i - 1]->addr;
607 curpbaddr = change_point[i]->pbios->addr;
608 lastpbaddr = change_point[i - 1]->pbios->addr;
609
610 /*
611 * swap entries, when:
612 *
613 * curaddr > lastaddr or
614 * curaddr == lastaddr and curaddr == curpbaddr and
615 * lastaddr != lastpbaddr
616 */
617 if (curaddr < lastaddr ||
618 (curaddr == lastaddr && curaddr == curpbaddr &&
619 lastaddr != lastpbaddr)) {
620 change_tmp = change_point[i];
621 change_point[i] = change_point[i-1];
622 change_point[i-1] = change_tmp;
623 still_changing = 1;
624 }
625 }
626 }
627
628 /* create a new bios memory map, removing overlaps */
629 overlap_entries = 0; /* number of entries in the overlap table */
630 new_bios_entry = 0; /* index for creating new bios map entries */
631 last_type = 0; /* start with undefined memory type */
632 last_addr = 0; /* start with 0 as last starting address */
633
634 /* loop through change-points, determining effect on the new bios map */
635 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
636 /* keep track of all overlapping bios entries */
637 if (change_point[chgidx]->addr ==
638 change_point[chgidx]->pbios->addr) {
639 /*
640 * add map entry to overlap list (> 1 entry
641 * implies an overlap)
642 */
643 overlap_list[overlap_entries++] =
644 change_point[chgidx]->pbios;
645 } else {
646 /*
647 * remove entry from list (order independent,
648 * so swap with last)
649 */
650 for (i = 0; i < overlap_entries; i++) {
651 if (overlap_list[i] ==
652 change_point[chgidx]->pbios)
653 overlap_list[i] =
654 overlap_list[overlap_entries-1];
655 }
656 overlap_entries--;
657 }
658 /*
659 * if there are overlapping entries, decide which
660 * "type" to use (larger value takes precedence --
661 * 1=usable, 2,3,4,4+=unusable)
662 */
663 current_type = 0;
664 for (i = 0; i < overlap_entries; i++)
665 if (overlap_list[i]->type > current_type)
666 current_type = overlap_list[i]->type;
667 /*
668 * continue building up new bios map based on this
669 * information
670 */
671 if (current_type != last_type) {
672 if (last_type != 0) {
673 new_bios[new_bios_entry].size =
674 change_point[chgidx]->addr - last_addr;
675 /*
676 * move forward only if the new size
677 * was non-zero
678 */
679 if (new_bios[new_bios_entry].size != 0)
680 /*
681 * no more space left for new
682 * bios entries ?
683 */
684 if (++new_bios_entry >= E820MAX)
685 break;
686 }
687 if (current_type != 0) {
688 new_bios[new_bios_entry].addr =
689 change_point[chgidx]->addr;
690 new_bios[new_bios_entry].type = current_type;
691 last_addr = change_point[chgidx]->addr;
692 }
693 last_type = current_type;
694 }
695 }
696 /* retain count for new bios entries */
697 new_nr = new_bios_entry;
698
699 /* copy new bios mapping into original location */
700 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
701 *pnr_map = new_nr;
702
703 return 0;
704}
705
706/*
707 * Copy the BIOS e820 map into a safe place.
708 *
709 * Sanity-check it while we're at it..
710 *
711 * If we're lucky and live on a modern system, the setup code
712 * will have given us a memory map that we can use to properly
713 * set up memory. If we aren't, we'll fake a memory map.
714 */
715static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
716{
717 /* Only one memory region (or negative)? Ignore it */
718 if (nr_map < 2)
719 return -1;
720
721 do {
722 u64 start = biosmap->addr;
723 u64 size = biosmap->size;
724 u64 end = start + size;
725 u32 type = biosmap->type;
726
727 /* Overflow in 64 bits? Ignore the memory map. */
728 if (start > end)
729 return -1;
730
731 add_memory_region(start, size, type);
732 } while (biosmap++, --nr_map);
733 return 0;
734}
735
736static void early_panic(char *msg)
737{
738 early_printk(msg);
739 panic(msg);
740}
741
742/* Not void purely for compatibility with the x86 32-bit version */
743char * __init machine_specific_memory_setup(void)
744{
745 char *who = "BIOS-e820";
746 /*
747 * Try to copy the BIOS-supplied E820-map.
748 *
749 * Otherwise fake a memory map; one section from 0k->640k,
750 * the next section from 1mb->appropriate_mem_k
751 */
752 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
753 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
754 early_panic("Cannot find a valid memory map");
755 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
756 e820_print_map(who);
757
758 /* In case someone cares... */
759 return who;
760}
761
762static int __init parse_memopt(char *p)
763{
764 if (!p)
765 return -EINVAL;
766 end_user_pfn = memparse(p, &p);
767 end_user_pfn >>= PAGE_SHIFT;
768 return 0;
769}
770early_param("mem", parse_memopt);
771
772static int userdef __initdata;
773
774static int __init parse_memmap_opt(char *p)
775{
776 char *oldp;
777 unsigned long long start_at, mem_size;
778
779 if (!strcmp(p, "exactmap")) {
780#ifdef CONFIG_CRASH_DUMP
781 /*
782 * If we are doing a crash dump, we still need to know
783 * the real mem size before the original memory map is
784 * reset.
785 */
786 e820_register_active_regions(0, 0, -1UL);
787 saved_max_pfn = e820_end_of_ram();
788 remove_all_active_ranges();
789#endif
790 max_pfn_mapped = 0;
791 e820.nr_map = 0;
792 userdef = 1;
793 return 0;
794 }
795
796 oldp = p;
797 mem_size = memparse(p, &p);
798 if (p == oldp)
799 return -EINVAL;
800
801 userdef = 1;
802 if (*p == '@') {
803 start_at = memparse(p+1, &p);
804 add_memory_region(start_at, mem_size, E820_RAM);
805 } else if (*p == '#') {
806 start_at = memparse(p+1, &p);
807 add_memory_region(start_at, mem_size, E820_ACPI);
808 } else if (*p == '$') {
809 start_at = memparse(p+1, &p);
810 add_memory_region(start_at, mem_size, E820_RESERVED);
811 } else {
812 end_user_pfn = (mem_size >> PAGE_SHIFT);
813 }
814 return *p == '\0' ? 0 : -EINVAL;
815}
816early_param("memmap", parse_memmap_opt);
817
818void __init finish_e820_parsing(void)
819{
820 if (userdef) {
821 char nr = e820.nr_map;
822
823 if (sanitize_e820_map(e820.map, &nr) < 0)
824 early_panic("Invalid user supplied memory map");
825 e820.nr_map = nr;
826
827 printk(KERN_INFO "user-defined physical RAM map:\n");
828 e820_print_map("user");
829 }
830}
831
832void __init update_memory_range(u64 start, u64 size, unsigned old_type,
833 unsigned new_type)
834{
835 int i;
836
837 BUG_ON(old_type == new_type);
838
839 for (i = 0; i < e820.nr_map; i++) {
840 struct e820entry *ei = &e820.map[i];
841 u64 final_start, final_end;
842 if (ei->type != old_type)
843 continue;
844 /* totally covered? */
845 if (ei->addr >= start && ei->size <= size) {
846 ei->type = new_type;
847 continue;
848 }
849 /* partially covered */
850 final_start = max(start, ei->addr);
851 final_end = min(start + size, ei->addr + ei->size);
852 if (final_start >= final_end)
853 continue;
854 add_memory_region(final_start, final_end - final_start,
855 new_type);
856 }
857}
858
859void __init update_e820(void)
860{
861 u8 nr_map;
862
863 nr_map = e820.nr_map;
864 if (sanitize_e820_map(e820.map, &nr_map))
865 return;
866 e820.nr_map = nr_map;
867 printk(KERN_INFO "modified physical RAM map:\n");
868 e820_print_map("modified");
869}
870
871unsigned long pci_mem_start = 0xaeedbabe;
872EXPORT_SYMBOL(pci_mem_start);
873
874/*
875 * Search for the biggest gap in the low 32 bits of the e820
876 * memory space. We pass this space to PCI to assign MMIO resources
877 * for hotplug or unconfigured devices in.
878 * Hopefully the BIOS left enough space for it.
879 */
880__init void e820_setup_gap(void)
881{
882 unsigned long gapstart, gapsize, round;
883 unsigned long last;
884 int i;
885 int found = 0;
886
887 last = 0x100000000ull;
888 gapstart = 0x10000000;
889 gapsize = 0x400000;
890 i = e820.nr_map;
891 while (--i >= 0) {
892 unsigned long long start = e820.map[i].addr;
893 unsigned long long end = start + e820.map[i].size;
894
895 /*
896 * Since "last" is at most 4GB, we know we'll
897 * fit in 32 bits if this condition is true
898 */
899 if (last > end) {
900 unsigned long gap = last - end;
901
902 if (gap > gapsize) {
903 gapsize = gap;
904 gapstart = end;
905 found = 1;
906 }
907 }
908 if (start < last)
909 last = start;
910 }
911
912 if (!found) {
913 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
914 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
915 "address range\n"
916 KERN_ERR "PCI: Unassigned devices with 32bit resource "
917 "registers may break!\n");
918 }
919
920 /*
921 * See how much we want to round up: start off with
922 * rounding to the next 1MB area.
923 */
924 round = 0x100000;
925 while ((gapsize >> 4) > round)
926 round += round;
927 /* Fun with two's complement */
928 pci_mem_start = (gapstart + round) & -round;
929
930 printk(KERN_INFO
931 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
932 pci_mem_start, gapstart, gapsize);
933}
934
935int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
936{
937 int i;
938
939 if (slot < 0 || slot >= e820.nr_map)
940 return -1;
941 for (i = slot; i < e820.nr_map; i++) {
942 if (e820.map[i].type != E820_RAM)
943 continue;
944 break;
945 }
946 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
947 return -1;
948 *addr = e820.map[i].addr;
949 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
950 max_pfn << PAGE_SHIFT) - *addr;
951 return i + 1;
952}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e82..733c4f8d42ea 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
@@ -50,7 +47,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func)
50static void __init via_bugs(int num, int slot, int func) 47static void __init via_bugs(int num, int slot, int func)
51{ 48{
52#ifdef CONFIG_GART_IOMMU 49#ifdef CONFIG_GART_IOMMU
53 if ((end_pfn > MAX_DMA32_PFN || force_iommu) && 50 if ((max_pfn > MAX_DMA32_PFN || force_iommu) &&
54 !gart_iommu_aperture_allowed) { 51 !gart_iommu_aperture_allowed) {
55 printk(KERN_INFO 52 printk(KERN_INFO
56 "Looks like a VIA chipset. Disabling IOMMU." 53 "Looks like a VIA chipset. Disabling IOMMU."
@@ -98,17 +95,66 @@ static void __init nvidia_bugs(int num, int slot, int func)
98 95
99} 96}
100 97
98static u32 ati_ixp4x0_rev(int num, int slot, int func)
99{
100 u32 d;
101 u8 b;
102
103 b = read_pci_config_byte(num, slot, func, 0xac);
104 b &= ~(1<<5);
105 write_pci_config_byte(num, slot, func, 0xac, b);
106
107 d = read_pci_config(num, slot, func, 0x70);
108 d |= 1<<8;
109 write_pci_config(num, slot, func, 0x70, d);
110
111 d = read_pci_config(num, slot, func, 0x8);
112 d &= 0xff;
113 return d;
114}
115
101static void __init ati_bugs(int num, int slot, int func) 116static void __init ati_bugs(int num, int slot, int func)
102{ 117{
103#ifdef CONFIG_X86_IO_APIC 118#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
104 if (timer_over_8254 == 1) { 119 u32 d;
105 timer_over_8254 = 0; 120 u8 b;
106 printk(KERN_INFO 121
107 "ATI board detected. Disabling timer routing over 8254.\n"); 122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you got timer trouble "
139 "try acpi_use_timer_override\n");
108 } 140 }
109#endif 141#endif
110} 142}
111 143
144#ifdef CONFIG_DMAR
145static void __init intel_g33_dmar(int num, int slot, int func)
146{
147 struct acpi_table_header *dmar_tbl;
148 acpi_status status;
149
150 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
151 if (ACPI_SUCCESS(status)) {
152 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
153 dmar_disabled = 1;
154 }
155}
156#endif
157
112#define QFLAG_APPLY_ONCE 0x1 158#define QFLAG_APPLY_ONCE 0x1
113#define QFLAG_APPLIED 0x2 159#define QFLAG_APPLIED 0x2
114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 160#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,14 +172,29 @@ static struct chipset early_qrk[] __initdata = {
126 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, 172 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
127 { PCI_VENDOR_ID_VIA, PCI_ANY_ID, 173 { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 174 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
130 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
131 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 175 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
132 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 176 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
177 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
178 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
179#ifdef CONFIG_DMAR
180 { PCI_VENDOR_ID_INTEL, 0x29c0,
181 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
182#endif
133 {} 183 {}
134}; 184};
135 185
136static void __init check_dev_quirk(int num, int slot, int func) 186/**
187 * check_dev_quirk - apply early quirks to a given PCI device
188 * @num: bus number
189 * @slot: slot number
190 * @func: PCI function
191 *
192 * Check the vendor & device ID against the early quirks table.
193 *
194 * If the device is single function, let early_quirks() know so we don't
195 * poke at this device again.
196 */
197static int __init check_dev_quirk(int num, int slot, int func)
137{ 198{
138 u16 class; 199 u16 class;
139 u16 vendor; 200 u16 vendor;
@@ -144,7 +205,7 @@ static void __init check_dev_quirk(int num, int slot, int func)
144 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); 205 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
145 206
146 if (class == 0xffff) 207 if (class == 0xffff)
147 return; 208 return -1; /* no class, treat as single function */
148 209
149 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); 210 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
150 211
@@ -167,7 +228,9 @@ static void __init check_dev_quirk(int num, int slot, int func)
167 type = read_pci_config_byte(num, slot, func, 228 type = read_pci_config_byte(num, slot, func,
168 PCI_HEADER_TYPE); 229 PCI_HEADER_TYPE);
169 if (!(type & 0x80)) 230 if (!(type & 0x80))
170 return; 231 return -1;
232
233 return 0;
171} 234}
172 235
173void __init early_quirks(void) 236void __init early_quirks(void)
@@ -180,6 +243,9 @@ void __init early_quirks(void)
180 /* Poor man's PCI discovery */ 243 /* Poor man's PCI discovery */
181 for (num = 0; num < 32; num++) 244 for (num = 0; num < 32; num++)
182 for (slot = 0; slot < 32; slot++) 245 for (slot = 0; slot < 32; slot++)
183 for (func = 0; func < 8; func++) 246 for (func = 0; func < 8; func++) {
184 check_dev_quirk(num, slot, func); 247 /* Only probe function 0 on single fn devices */
248 if (check_dev_quirk(num, slot, func))
249 break;
250 }
185} 251}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 643fd861b724..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
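
/*
 * Note on the helpers above (illustrative values): DBGP_DATA_TOGGLE xors
 * the data-PID byte of the saved pids word, so a packet sent with
 * USB_PID_DATA0 (0xc3) is followed by one with USB_PID_DATA1 (0x4b),
 * because 0xc3 ^ 0x88 == 0x4b; dbgp_len_update() just rewrites the low
 * four length bits of the control word.
 */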
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data,
285 * start pacing ourselves; not necessary, but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
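
/*
 * A minimal, hypothetical console write path built on dbgp_bulk_write()
 * above (the function name is illustrative, not part of the driver):
 * chunk the output buffer into DBGP_MAX_PACKET-sized bulk writes to the
 * debug device.
 */
static void example_dbgp_write(const char *buf, unsigned int n)
{
	int chunk;

	while (n > 0) {
		chunk = n;
		if (chunk > DBGP_MAX_PACKET)
			chunk = DBGP_MAX_PACKET;
		dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, buf, chunk);
		buf += chunk;
		n -= chunk;
	}
}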
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,9 +920,9 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
201 char buf[512]; 927 char buf[512];
202 int n; 928 int n;
@@ -208,10 +934,11 @@ void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
 973	 * the USB subsystem will reset the EHCI controller, so don't
 974	 * keep this early console
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg)
64} 64}
65early_param("noefi", setup_noefi); 65early_param("noefi", setup_noefi);
66 66
67int add_efi_memmap;
68EXPORT_SYMBOL(add_efi_memmap);
69
70static int __init setup_add_efi_memmap(char *arg)
71{
72 add_efi_memmap = 1;
73 return 0;
74}
75early_param("add_efi_memmap", setup_add_efi_memmap);
76
77
67static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) 78static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
68{ 79{
69 return efi_call_virt2(get_time, tm, tc); 80 return efi_call_virt2(get_time, tm, tc);
@@ -213,6 +224,50 @@ unsigned long efi_get_time(void)
213 eft.minute, eft.second); 224 eft.minute, eft.second);
214} 225}
215 226
227/*
228 * Tell the kernel about the EFI memory map. This might include
229 * more than the max 128 entries that can fit in the e820 legacy
230 * (zeropage) memory map.
231 */
232
233static void __init do_add_efi_memmap(void)
234{
235 void *p;
236
237 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
238 efi_memory_desc_t *md = p;
239 unsigned long long start = md->phys_addr;
240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
241 int e820_type;
242
243 if (md->attribute & EFI_MEMORY_WB)
244 e820_type = E820_RAM;
245 else
246 e820_type = E820_RESERVED;
247 e820_add_region(start, size, e820_type);
248 }
249 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
250}
251
252void __init efi_reserve_early(void)
253{
254 unsigned long pmap;
255
256#ifdef CONFIG_X86_32
257 pmap = boot_params.efi_info.efi_memmap;
258#else
259 pmap = (boot_params.efi_info.efi_memmap |
260 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
261#endif
262 memmap.phys_map = (void *)pmap;
263 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
264 boot_params.efi_info.efi_memdesc_size;
265 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
266 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
267 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
268 "EFI memmap");
269}
270
216#if EFI_DEBUG 271#if EFI_DEBUG
217static void __init print_efi_memmap(void) 272static void __init print_efi_memmap(void)
218{ 273{
@@ -244,19 +299,11 @@ void __init efi_init(void)
244 299
245#ifdef CONFIG_X86_32 300#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; 301 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
248#else 302#else
249 efi_phys.systab = (efi_system_table_t *) 303 efi_phys.systab = (efi_system_table_t *)
250 (boot_params.efi_info.efi_systab | 304 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32)); 305 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif 306#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260 307
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab, 308 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t)); 309 sizeof(efi_system_table_t));
@@ -367,9 +414,13 @@ void __init efi_init(void)
367 if (memmap.map == NULL) 414 if (memmap.map == NULL)
368 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
369 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
370 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
372 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
422 if (add_efi_memmap)
423 do_add_efi_memmap();
373 424
374 /* Setup for EFI runtime service */ 425 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI; 426 reboot_type = BOOT_EFI;
@@ -424,7 +475,7 @@ void __init efi_enter_virtual_mode(void)
424 size = md->num_pages << EFI_PAGE_SHIFT; 475 size = md->num_pages << EFI_PAGE_SHIFT;
425 end = md->phys_addr + size; 476 end = md->phys_addr + size;
426 477
427 if (PFN_UP(end) <= max_pfn_mapped) 478 if (PFN_UP(end) <= max_low_pfn_mapped)
428 va = __va(md->phys_addr); 479 va = __va(md->phys_addr);
429 else 480 else
430 va = efi_ioremap(md->phys_addr, size); 481 va = efi_ioremap(md->phys_addr, size);
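
A note on the descriptor walk in do_add_efi_memmap() above: the firmware's descriptor size can be larger than sizeof(efi_memory_desc_t), which is also why efi_init() warns when the two differ, so the map must be stepped by memmap.desc_size bytes rather than indexed as a C array of efi_memory_desc_t. A condensed sketch of that idiom (editorial illustration, not an addition to the patch):

	/* Illustrative sketch of the desc_size-stride walk used above. */
	void *p;

	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
		efi_memory_desc_t *md = p;
		u64 size = md->num_pages << EFI_PAGE_SHIFT;	/* pages -> bytes */

		/* classify md->attribute (e.g. EFI_MEMORY_WB -> E820_RAM)
		 * and hand [md->phys_addr, md->phys_addr + size) to e820 */
	}
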
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 5d23d85624d4..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -49,13 +49,13 @@ void efi_call_phys_prelog(void)
49 local_irq_save(efi_rt_eflags); 49 local_irq_save(efi_rt_eflags);
50 50
51 /* 51 /*
52 * If I don't have PSE, I should just duplicate two entries in page 52 * If I don't have PAE, I should just duplicate two entries in page
53 * directory. If I have PSE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4_safe();
57 57
58 if (cr4 & X86_CR4_PSE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
60 swapper_pg_dir[pgd_index(0)].pgd; 60 swapper_pg_dir[pgd_index(0)].pgd;
61 swapper_pg_dir[0].pgd = 61 swapper_pg_dir[0].pgd =
@@ -91,9 +91,9 @@ void efi_call_phys_epilog(void)
91 gdt_descr.size = GDT_SIZE - 1; 91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr); 92 load_gdt(&gdt_descr);
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4_safe();
95 95
96 if (cr4 & X86_CR4_PSE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
98 efi_bak_pg_dir_pointer[0].pgd; 98 efi_bak_pg_dir_pointer[0].pgd;
99 } else { 99 } else {
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdcccac..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void)
97 early_runtime_code_mapping_set_exec(0); 97 early_runtime_code_mapping_set_exec(0);
98} 98}
99 99
100void __init efi_reserve_bootmem(void) 100void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
101{
102 reserve_bootmem_generic((unsigned long)memmap.phys_map,
103 memmap.nr_map * memmap.desc_size);
104}
105
106void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
107{ 101{
108 static unsigned pages_mapped __initdata; 102 static unsigned pages_mapped __initdata;
109 unsigned i, pages; 103 unsigned i, pages;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a2..b21fbfaffe39 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,14 +51,25 @@
51#include <asm/percpu.h> 51#include <asm/percpu.h>
52#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include "irq_vectors.h" 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h>
56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
55 66
56/* 67/*
57 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret") 70 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 71 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). 72 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 * 73 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 74 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 75 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -331,8 +342,9 @@ sysenter_past_esp:
331 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
332 343
333 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
334 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
335 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
336 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
337 jae syscall_badsys 349 jae syscall_badsys
338 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -342,14 +354,54 @@ sysenter_past_esp:
342 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
343 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
344 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
345 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
346/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
347 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
348 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
349 xorl %ebp,%ebp 362 xorl %ebp,%ebp
350 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3511: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
352 ENABLE_INTERRUPTS_SYSCALL_RET 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
353 CFI_ENDPROC 405 CFI_ENDPROC
354.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3552: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -369,7 +421,7 @@ ENTRY(system_call)
369 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
370 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
371 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
372 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
373 jnz syscall_trace_entry 425 jnz syscall_trace_entry
374 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
375 jae syscall_badsys 427 jae syscall_badsys
@@ -382,10 +434,6 @@ syscall_exit:
382 # setting need_resched or sigpending 434 # setting need_resched or sigpending
383 # between sampling and the iret 435 # between sampling and the iret
384 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
385 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
386 jz no_singlestep
387 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
388no_singlestep:
389 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
390 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
391 jne syscall_exit_work 439 jne syscall_exit_work
@@ -513,12 +561,8 @@ END(work_pending)
513syscall_trace_entry: 561syscall_trace_entry:
514 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
515 movl %esp, %eax 563 movl %esp, %eax
516 xorl %edx,%edx 564 call syscall_trace_enter
517 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
518 cmpl $0, %eax
519 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
520 # so must skip actual syscall
521 movl PT_ORIG_EAX(%esp), %eax
522 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
523 jnae syscall_call 567 jnae syscall_call
524 jmp syscall_exit 568 jmp syscall_exit
@@ -527,14 +571,13 @@ END(syscall_trace_entry)
527 # perform syscall exit tracing 571 # perform syscall exit tracing
528 ALIGN 572 ALIGN
529syscall_exit_work: 573syscall_exit_work:
530 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
531 jz work_pending 575 jz work_pending
532 TRACE_IRQS_ON 576 TRACE_IRQS_ON
533 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
534 # schedule() instead 578 # schedule() instead
535 movl %esp, %eax 579 movl %esp, %eax
536 movl $1, %edx 580 call syscall_trace_leave
537 call do_syscall_trace
538 jmp resume_userspace 581 jmp resume_userspace
539END(syscall_exit_work) 582END(syscall_exit_work)
540 CFI_ENDPROC 583 CFI_ENDPROC
@@ -687,6 +730,7 @@ error_code:
687 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
688 movl %ecx, %ds 731 movl %ecx, %ds
689 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
690 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
691 call *%edi 735 call *%edi
692 jmp ret_from_exception 736 jmp ret_from_exception
@@ -717,20 +761,9 @@ ENTRY(device_not_available)
717 RING0_INT_FRAME 761 RING0_INT_FRAME
718 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
719 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
720 SAVE_ALL 764 pushl $do_device_not_available
721 GET_CR0_INTO_EAX
722 testl $0x4, %eax # EM (math emulation bit)
723 jne device_not_available_emulate
724 preempt_stop(CLBR_ANY)
725 call math_state_restore
726 jmp ret_from_exception
727device_not_available_emulate:
728 pushl $0 # temporary storage for ORIG_EIP
729 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
730 call math_emulate 766 jmp error_code
731 addl $4, %esp
732 CFI_ADJUST_CFA_OFFSET -4
733 jmp ret_from_exception
734 CFI_ENDPROC 767 CFI_ENDPROC
735END(device_not_available) 768END(device_not_available)
736 769
@@ -771,6 +804,7 @@ debug_stack_correct:
771 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
772 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
773 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
774 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
775 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
776 call do_debug 810 call do_debug
@@ -815,6 +849,7 @@ nmi_stack_correct:
815 pushl %eax 849 pushl %eax
816 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
817 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
818 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
819 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
820 call do_nmi 855 call do_nmi
@@ -855,6 +890,7 @@ nmi_espfix_stack:
855 pushl %eax 890 pushl %eax
856 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
857 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
858 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
859 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
860 call do_nmi 896 call do_nmi
@@ -874,10 +910,10 @@ ENTRY(native_iret)
874.previous 910.previous
875END(native_iret) 911END(native_iret)
876 912
877ENTRY(native_irq_enable_syscall_ret) 913ENTRY(native_irq_enable_sysexit)
878 sti 914 sti
879 sysexit 915 sysexit
880END(native_irq_enable_syscall_ret) 916END(native_irq_enable_sysexit)
881#endif 917#endif
882 918
883KPROBE_ENTRY(int3) 919KPROBE_ENTRY(int3)
@@ -885,6 +921,7 @@ KPROBE_ENTRY(int3)
885 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
886 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
887 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
888 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
889 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
890 call do_int3 927 call do_int3
@@ -987,7 +1024,7 @@ ENTRY(machine_check)
987 RING0_INT_FRAME 1024 RING0_INT_FRAME
988 pushl $0 1025 pushl $0
989 CFI_ADJUST_CFA_OFFSET 4 1026 CFI_ADJUST_CFA_OFFSET 4
990 pushl machine_check_vector 1027 pushl $do_machine_check
991 CFI_ADJUST_CFA_OFFSET 4 1028 CFI_ADJUST_CFA_OFFSET 4
992 jmp error_code 1029 jmp error_code
993 CFI_ENDPROC 1030 CFI_ENDPROC
@@ -1023,7 +1060,9 @@ ENDPROC(kernel_thread_helper)
1023ENTRY(xen_sysenter_target) 1060ENTRY(xen_sysenter_target)
1024 RING0_INT_FRAME 1061 RING0_INT_FRAME
1025 addl $5*4, %esp /* remove xen-provided frame */ 1062 addl $5*4, %esp /* remove xen-provided frame */
1063 CFI_ADJUST_CFA_OFFSET -5*4
1026 jmp sysenter_past_esp 1064 jmp sysenter_past_esp
1065 CFI_ENDPROC
1027 1066
1028ENTRY(xen_hypervisor_callback) 1067ENTRY(xen_hypervisor_callback)
1029 CFI_STARTPROC 1068 CFI_STARTPROC
@@ -1110,6 +1149,77 @@ ENDPROC(xen_failsafe_callback)
1110 1149
1111#endif /* CONFIG_XEN */ 1150#endif /* CONFIG_XEN */
1112 1151
1152#ifdef CONFIG_FTRACE
1153#ifdef CONFIG_DYNAMIC_FTRACE
1154
1155ENTRY(mcount)
1156 pushl %eax
1157 pushl %ecx
1158 pushl %edx
1159 movl 0xc(%esp), %eax
1160 subl $MCOUNT_INSN_SIZE, %eax
1161
1162.globl mcount_call
1163mcount_call:
1164 call ftrace_stub
1165
1166 popl %edx
1167 popl %ecx
1168 popl %eax
1169
1170 ret
1171END(mcount)
1172
1173ENTRY(ftrace_caller)
1174 pushl %eax
1175 pushl %ecx
1176 pushl %edx
1177 movl 0xc(%esp), %eax
1178 movl 0x4(%ebp), %edx
1179 subl $MCOUNT_INSN_SIZE, %eax
1180
1181.globl ftrace_call
1182ftrace_call:
1183 call ftrace_stub
1184
1185 popl %edx
1186 popl %ecx
1187 popl %eax
1188
1189.globl ftrace_stub
1190ftrace_stub:
1191 ret
1192END(ftrace_caller)
1193
1194#else /* ! CONFIG_DYNAMIC_FTRACE */
1195
1196ENTRY(mcount)
1197 cmpl $ftrace_stub, ftrace_trace_function
1198 jnz trace
1199.globl ftrace_stub
1200ftrace_stub:
1201 ret
1202
1203 /* taken from glibc */
1204trace:
1205 pushl %eax
1206 pushl %ecx
1207 pushl %edx
1208 movl 0xc(%esp), %eax
1209 movl 0x4(%ebp), %edx
1210 subl $MCOUNT_INSN_SIZE, %eax
1211
1212 call *ftrace_trace_function
1213
1214 popl %edx
1215 popl %ecx
1216 popl %eax
1217
1218 jmp ftrace_stub
1219END(mcount)
1220#endif /* CONFIG_DYNAMIC_FTRACE */
1221#endif /* CONFIG_FTRACE */
1222
1113.section .rodata,"a" 1223.section .rodata,"a"
1114#include "syscall_table_32.S" 1224#include "syscall_table_32.S"
1115 1225
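
The mcount/ftrace_caller stubs added above do the same job in both the static and dynamic variants: recover the address of the instrumented function (return address minus MCOUNT_INSN_SIZE) and the address of its call site, then dispatch to a C callback through ftrace_trace_function. The sketch below shows what such a callback looks like on the C side; the counter and the registration comment are illustrative assumptions, not taken from this patch.

	/* Illustrative sketch only: the callback signature the stubs above
	 * invoke -- traced function address first, its call site second. */
	static unsigned long my_hits;	/* example: just count events */

	static void my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		my_hits++;
	}

	/* Assumed registration path in this tree (treat as an assumption):
	 *	static struct ftrace_ops my_ops = { .func = my_trace_func };
	 *	register_ftrace_function(&my_ops);
	 */
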
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..1db6ce4314e1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -51,16 +51,127 @@
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h>
55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
54 61
55 .code64 62 .code64
56 63
64#ifdef CONFIG_FTRACE
65#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount)
67
68 subq $0x38, %rsp
69 movq %rax, (%rsp)
70 movq %rcx, 8(%rsp)
71 movq %rdx, 16(%rsp)
72 movq %rsi, 24(%rsp)
73 movq %rdi, 32(%rsp)
74 movq %r8, 40(%rsp)
75 movq %r9, 48(%rsp)
76
77 movq 0x38(%rsp), %rdi
78 subq $MCOUNT_INSN_SIZE, %rdi
79
80.globl mcount_call
81mcount_call:
82 call ftrace_stub
83
84 movq 48(%rsp), %r9
85 movq 40(%rsp), %r8
86 movq 32(%rsp), %rdi
87 movq 24(%rsp), %rsi
88 movq 16(%rsp), %rdx
89 movq 8(%rsp), %rcx
90 movq (%rsp), %rax
91 addq $0x38, %rsp
92
93 retq
94END(mcount)
95
96ENTRY(ftrace_caller)
97
98 /* taken from glibc */
99 subq $0x38, %rsp
100 movq %rax, (%rsp)
101 movq %rcx, 8(%rsp)
102 movq %rdx, 16(%rsp)
103 movq %rsi, 24(%rsp)
104 movq %rdi, 32(%rsp)
105 movq %r8, 40(%rsp)
106 movq %r9, 48(%rsp)
107
108 movq 0x38(%rsp), %rdi
109 movq 8(%rbp), %rsi
110 subq $MCOUNT_INSN_SIZE, %rdi
111
112.globl ftrace_call
113ftrace_call:
114 call ftrace_stub
115
116 movq 48(%rsp), %r9
117 movq 40(%rsp), %r8
118 movq 32(%rsp), %rdi
119 movq 24(%rsp), %rsi
120 movq 16(%rsp), %rdx
121 movq 8(%rsp), %rcx
122 movq (%rsp), %rax
123 addq $0x38, %rsp
124
125.globl ftrace_stub
126ftrace_stub:
127 retq
128END(ftrace_caller)
129
130#else /* ! CONFIG_DYNAMIC_FTRACE */
131ENTRY(mcount)
132 cmpq $ftrace_stub, ftrace_trace_function
133 jnz trace
134.globl ftrace_stub
135ftrace_stub:
136 retq
137
138trace:
139 /* taken from glibc */
140 subq $0x38, %rsp
141 movq %rax, (%rsp)
142 movq %rcx, 8(%rsp)
143 movq %rdx, 16(%rsp)
144 movq %rsi, 24(%rsp)
145 movq %rdi, 32(%rsp)
146 movq %r8, 40(%rsp)
147 movq %r9, 48(%rsp)
148
149 movq 0x38(%rsp), %rdi
150 movq 8(%rbp), %rsi
151 subq $MCOUNT_INSN_SIZE, %rdi
152
153 call *ftrace_trace_function
154
155 movq 48(%rsp), %r9
156 movq 40(%rsp), %r8
157 movq 32(%rsp), %rdi
158 movq 24(%rsp), %rsi
159 movq 16(%rsp), %rdx
160 movq 8(%rsp), %rcx
161 movq (%rsp), %rax
162 addq $0x38, %rsp
163
164 jmp ftrace_stub
165END(mcount)
166#endif /* CONFIG_DYNAMIC_FTRACE */
167#endif /* CONFIG_FTRACE */
168
57#ifndef CONFIG_PREEMPT 169#ifndef CONFIG_PREEMPT
58#define retint_kernel retint_restore_args 170#define retint_kernel retint_restore_args
59#endif 171#endif
60 172
61#ifdef CONFIG_PARAVIRT 173#ifdef CONFIG_PARAVIRT
62ENTRY(native_irq_enable_syscall_ret) 174ENTRY(native_usergs_sysret64)
63 movq %gs:pda_oldrsp,%rsp
64 swapgs 175 swapgs
65 sysretq 176 sysretq
66#endif /* CONFIG_PARAVIRT */ 177#endif /* CONFIG_PARAVIRT */
@@ -104,7 +215,7 @@ ENTRY(native_irq_enable_syscall_ret)
104 .macro FAKE_STACK_FRAME child_rip 215 .macro FAKE_STACK_FRAME child_rip
105 /* push in order ss, rsp, eflags, cs, rip */ 216 /* push in order ss, rsp, eflags, cs, rip */
106 xorl %eax, %eax 217 xorl %eax, %eax
107 pushq %rax /* ss */ 218 pushq $__KERNEL_DS /* ss */
108 CFI_ADJUST_CFA_OFFSET 8 219 CFI_ADJUST_CFA_OFFSET 8
109 /*CFI_REL_OFFSET ss,0*/ 220 /*CFI_REL_OFFSET ss,0*/
110 pushq %rax /* rsp */ 221 pushq %rax /* rsp */
@@ -164,18 +275,18 @@ ENTRY(native_irq_enable_syscall_ret)
164ENTRY(ret_from_fork) 275ENTRY(ret_from_fork)
165 CFI_DEFAULT_STACK 276 CFI_DEFAULT_STACK
166 push kernel_eflags(%rip) 277 push kernel_eflags(%rip)
167 CFI_ADJUST_CFA_OFFSET 4 278 CFI_ADJUST_CFA_OFFSET 8
168 popf # reset kernel eflags 279 popf # reset kernel eflags
169 CFI_ADJUST_CFA_OFFSET -4 280 CFI_ADJUST_CFA_OFFSET -8
170 call schedule_tail 281 call schedule_tail
171 GET_THREAD_INFO(%rcx) 282 GET_THREAD_INFO(%rcx)
172 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) 283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
173 jnz rff_trace 284 jnz rff_trace
174rff_action: 285rff_action:
175 RESTORE_REST 286 RESTORE_REST
176 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 287 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
177 je int_ret_from_sys_call 288 je int_ret_from_sys_call
178 testl $_TIF_IA32,threadinfo_flags(%rcx) 289 testl $_TIF_IA32,TI_flags(%rcx)
179 jnz int_ret_from_sys_call 290 jnz int_ret_from_sys_call
180 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 291 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
181 jmp ret_from_sys_call 292 jmp ret_from_sys_call
@@ -244,8 +355,9 @@ ENTRY(system_call_after_swapgs)
244 movq %rcx,RIP-ARGOFFSET(%rsp) 355 movq %rcx,RIP-ARGOFFSET(%rsp)
245 CFI_REL_OFFSET rip,RIP-ARGOFFSET 356 CFI_REL_OFFSET rip,RIP-ARGOFFSET
246 GET_THREAD_INFO(%rcx) 357 GET_THREAD_INFO(%rcx)
247 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) 358 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
248 jnz tracesys 359 jnz tracesys
360system_call_fastpath:
249 cmpq $__NR_syscall_max,%rax 361 cmpq $__NR_syscall_max,%rax
250 ja badsys 362 ja badsys
251 movq %r10,%rcx 363 movq %r10,%rcx
@@ -263,7 +375,7 @@ sysret_check:
263 GET_THREAD_INFO(%rcx) 375 GET_THREAD_INFO(%rcx)
264 DISABLE_INTERRUPTS(CLBR_NONE) 376 DISABLE_INTERRUPTS(CLBR_NONE)
265 TRACE_IRQS_OFF 377 TRACE_IRQS_OFF
266 movl threadinfo_flags(%rcx),%edx 378 movl TI_flags(%rcx),%edx
267 andl %edi,%edx 379 andl %edi,%edx
268 jnz sysret_careful 380 jnz sysret_careful
269 CFI_REMEMBER_STATE 381 CFI_REMEMBER_STATE
@@ -275,7 +387,8 @@ sysret_check:
275 CFI_REGISTER rip,rcx 387 CFI_REGISTER rip,rcx
276 RESTORE_ARGS 0,-ARG_SKIP,1 388 RESTORE_ARGS 0,-ARG_SKIP,1
277 /*CFI_REGISTER rflags,r11*/ 389 /*CFI_REGISTER rflags,r11*/
278 ENABLE_INTERRUPTS_SYSCALL_RET 390 movq %gs:pda_oldrsp, %rsp
391 USERGS_SYSRET64
279 392
280 CFI_RESTORE_STATE 393 CFI_RESTORE_STATE
281 /* Handle reschedules */ 394 /* Handle reschedules */
@@ -296,16 +409,16 @@ sysret_careful:
296sysret_signal: 409sysret_signal:
297 TRACE_IRQS_ON 410 TRACE_IRQS_ON
298 ENABLE_INTERRUPTS(CLBR_NONE) 411 ENABLE_INTERRUPTS(CLBR_NONE)
299 testl $_TIF_DO_NOTIFY_MASK,%edx 412#ifdef CONFIG_AUDITSYSCALL
300 jz 1f 413 bt $TIF_SYSCALL_AUDIT,%edx
301 414 jc sysret_audit
302 /* Really a signal */ 415#endif
303 /* edx: work flags (arg3) */ 416 /* edx: work flags (arg3) */
304 leaq do_notify_resume(%rip),%rax 417 leaq do_notify_resume(%rip),%rax
305 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 418 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
306 xorl %esi,%esi # oldset -> arg2 419 xorl %esi,%esi # oldset -> arg2
307 call ptregscall_common 420 call ptregscall_common
3081: movl $_TIF_NEED_RESCHED,%edi 421 movl $_TIF_WORK_MASK,%edi
309 /* Use IRET because user could have changed frame. This 422 /* Use IRET because user could have changed frame. This
310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 423 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
311 DISABLE_INTERRUPTS(CLBR_NONE) 424 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -316,14 +429,56 @@ badsys:
316 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 429 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
317 jmp ret_from_sys_call 430 jmp ret_from_sys_call
318 431
432#ifdef CONFIG_AUDITSYSCALL
433 /*
434 * Fast path for syscall audit without full syscall trace.
435 * We just call audit_syscall_entry() directly, and then
436 * jump back to the normal fast path.
437 */
438auditsys:
439 movq %r10,%r9 /* 6th arg: 4th syscall arg */
440 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
441 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
442 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
443 movq %rax,%rsi /* 2nd arg: syscall number */
444 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
445 call audit_syscall_entry
446 LOAD_ARGS 0 /* reload call-clobbered registers */
447 jmp system_call_fastpath
448
449 /*
450 * Return fast path for syscall audit. Call audit_syscall_exit()
451 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
452 * masked off.
453 */
454sysret_audit:
455 movq %rax,%rsi /* second arg, syscall return value */
456 cmpq $0,%rax /* is it < 0? */
457 setl %al /* 1 if so, 0 if not */
458 movzbl %al,%edi /* zero-extend that into %edi */
459 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
460 call audit_syscall_exit
461 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
462 jmp sysret_check
463#endif /* CONFIG_AUDITSYSCALL */
464
319 /* Do syscall tracing */ 465 /* Do syscall tracing */
320tracesys: 466tracesys:
467#ifdef CONFIG_AUDITSYSCALL
468 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
469 jz auditsys
470#endif
321 SAVE_REST 471 SAVE_REST
322 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 472 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
323 FIXUP_TOP_OF_STACK %rdi 473 FIXUP_TOP_OF_STACK %rdi
324 movq %rsp,%rdi 474 movq %rsp,%rdi
325 call syscall_trace_enter 475 call syscall_trace_enter
326 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 476 /*
477 * Reload arg registers from stack in case ptrace changed them.
478 * We don't reload %rax because syscall_trace_enter() returned
479 * the value it wants us to use in the table lookup.
480 */
481 LOAD_ARGS ARGOFFSET, 1
327 RESTORE_REST 482 RESTORE_REST
328 cmpq $__NR_syscall_max,%rax 483 cmpq $__NR_syscall_max,%rax
329 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 484 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -337,6 +492,7 @@ tracesys:
337 * Has correct top of stack, but partial stack frame. 492 * Has correct top of stack, but partial stack frame.
338 */ 493 */
339 .globl int_ret_from_sys_call 494 .globl int_ret_from_sys_call
495 .globl int_with_check
340int_ret_from_sys_call: 496int_ret_from_sys_call:
341 DISABLE_INTERRUPTS(CLBR_NONE) 497 DISABLE_INTERRUPTS(CLBR_NONE)
342 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
@@ -347,10 +503,10 @@ int_ret_from_sys_call:
347int_with_check: 503int_with_check:
348 LOCKDEP_SYS_EXIT_IRQ 504 LOCKDEP_SYS_EXIT_IRQ
349 GET_THREAD_INFO(%rcx) 505 GET_THREAD_INFO(%rcx)
350 movl threadinfo_flags(%rcx),%edx 506 movl TI_flags(%rcx),%edx
351 andl %edi,%edx 507 andl %edi,%edx
352 jnz int_careful 508 jnz int_careful
353 andl $~TS_COMPAT,threadinfo_status(%rcx) 509 andl $~TS_COMPAT,TI_status(%rcx)
354 jmp retint_swapgs 510 jmp retint_swapgs
355 511
356 /* Either reschedule or signal or syscall exit tracking needed. */ 512 /* Either reschedule or signal or syscall exit tracking needed. */
@@ -376,7 +532,7 @@ int_very_careful:
376 ENABLE_INTERRUPTS(CLBR_NONE) 532 ENABLE_INTERRUPTS(CLBR_NONE)
377 SAVE_REST 533 SAVE_REST
378 /* Check for syscall exit trace */ 534 /* Check for syscall exit trace */
379 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 535 testl $_TIF_WORK_SYSCALL_EXIT,%edx
380 jz int_signal 536 jz int_signal
381 pushq %rdi 537 pushq %rdi
382 CFI_ADJUST_CFA_OFFSET 8 538 CFI_ADJUST_CFA_OFFSET 8
@@ -384,7 +540,7 @@ int_very_careful:
384 call syscall_trace_leave 540 call syscall_trace_leave
385 popq %rdi 541 popq %rdi
386 CFI_ADJUST_CFA_OFFSET -8 542 CFI_ADJUST_CFA_OFFSET -8
387 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 543 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
388 jmp int_restore_rest 544 jmp int_restore_rest
389 545
390int_signal: 546int_signal:
@@ -393,7 +549,7 @@ int_signal:
393 movq %rsp,%rdi # &ptregs -> arg1 549 movq %rsp,%rdi # &ptregs -> arg1
394 xorl %esi,%esi # oldset -> arg2 550 xorl %esi,%esi # oldset -> arg2
395 call do_notify_resume 551 call do_notify_resume
3961: movl $_TIF_NEED_RESCHED,%edi 5521: movl $_TIF_WORK_MASK,%edi
397int_restore_rest: 553int_restore_rest:
398 RESTORE_REST 554 RESTORE_REST
399 DISABLE_INTERRUPTS(CLBR_NONE) 555 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -420,7 +576,6 @@ END(\label)
420 PTREGSCALL stub_clone, sys_clone, %r8 576 PTREGSCALL stub_clone, sys_clone, %r8
421 PTREGSCALL stub_fork, sys_fork, %rdi 577 PTREGSCALL stub_fork, sys_fork, %rdi
422 PTREGSCALL stub_vfork, sys_vfork, %rdi 578 PTREGSCALL stub_vfork, sys_vfork, %rdi
423 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
424 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx 579 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
425 PTREGSCALL stub_iopl, sys_iopl, %rsi 580 PTREGSCALL stub_iopl, sys_iopl, %rsi
426 581
@@ -512,6 +667,13 @@ END(stub_rt_sigreturn)
512 SAVE_ARGS 667 SAVE_ARGS
513 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 668 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
514 pushq %rbp 669 pushq %rbp
670 /*
671 * Save rbp twice: One is for marking the stack frame, as usual, and the
672 * other, to fill pt_regs properly. This is because bx comes right
673 * before the last saved register in that structure, and not bp. If the
674 * base pointer were in the place bx is today, this would not be needed.
675 */
676 movq %rbp, -8(%rsp)
515 CFI_ADJUST_CFA_OFFSET 8 677 CFI_ADJUST_CFA_OFFSET 8
516 CFI_REL_OFFSET rbp, 0 678 CFI_REL_OFFSET rbp, 0
517 movq %rsp,%rbp 679 movq %rsp,%rbp
@@ -559,7 +721,7 @@ retint_with_reschedule:
559 movl $_TIF_WORK_MASK,%edi 721 movl $_TIF_WORK_MASK,%edi
560retint_check: 722retint_check:
561 LOCKDEP_SYS_EXIT_IRQ 723 LOCKDEP_SYS_EXIT_IRQ
562 movl threadinfo_flags(%rcx),%edx 724 movl TI_flags(%rcx),%edx
563 andl %edi,%edx 725 andl %edi,%edx
564 CFI_REMEMBER_STATE 726 CFI_REMEMBER_STATE
565 jnz retint_careful 727 jnz retint_careful
@@ -647,17 +809,16 @@ retint_signal:
647 RESTORE_REST 809 RESTORE_REST
648 DISABLE_INTERRUPTS(CLBR_NONE) 810 DISABLE_INTERRUPTS(CLBR_NONE)
649 TRACE_IRQS_OFF 811 TRACE_IRQS_OFF
650 movl $_TIF_NEED_RESCHED,%edi
651 GET_THREAD_INFO(%rcx) 812 GET_THREAD_INFO(%rcx)
652 jmp retint_check 813 jmp retint_with_reschedule
653 814
654#ifdef CONFIG_PREEMPT 815#ifdef CONFIG_PREEMPT
655 /* Returning to kernel space. Check if we need preemption */ 816 /* Returning to kernel space. Check if we need preemption */
656 /* rcx: threadinfo. interrupts off. */ 817 /* rcx: threadinfo. interrupts off. */
657ENTRY(retint_kernel) 818ENTRY(retint_kernel)
658 cmpl $0,threadinfo_preempt_count(%rcx) 819 cmpl $0,TI_preempt_count(%rcx)
659 jnz retint_restore_args 820 jnz retint_restore_args
660 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) 821 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
661 jnc retint_restore_args 822 jnc retint_restore_args
662 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 823 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
663 jnc retint_restore_args 824 jnc retint_restore_args
@@ -711,6 +872,9 @@ END(invalidate_interrupt\num)
711ENTRY(call_function_interrupt) 872ENTRY(call_function_interrupt)
712 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt 873 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
713END(call_function_interrupt) 874END(call_function_interrupt)
875ENTRY(call_function_single_interrupt)
876 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
877END(call_function_single_interrupt)
714ENTRY(irq_move_cleanup_interrupt) 878ENTRY(irq_move_cleanup_interrupt)
715 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt 879 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
716END(irq_move_cleanup_interrupt) 880END(irq_move_cleanup_interrupt)
@@ -720,6 +884,10 @@ ENTRY(apic_timer_interrupt)
720 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 884 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
721END(apic_timer_interrupt) 885END(apic_timer_interrupt)
722 886
887ENTRY(uv_bau_message_intr1)
888 apicinterrupt 220,uv_bau_message_interrupt
889END(uv_bau_message_intr1)
890
723ENTRY(error_interrupt) 891ENTRY(error_interrupt)
724 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 892 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
725END(error_interrupt) 893END(error_interrupt)
@@ -733,6 +901,7 @@ END(spurious_interrupt)
733 */ 901 */
734 .macro zeroentry sym 902 .macro zeroentry sym
735 INTR_FRAME 903 INTR_FRAME
904 PARAVIRT_ADJUST_EXCEPTION_FRAME
736 pushq $0 /* push error code/oldrax */ 905 pushq $0 /* push error code/oldrax */
737 CFI_ADJUST_CFA_OFFSET 8 906 CFI_ADJUST_CFA_OFFSET 8
738 pushq %rax /* push real oldrax to the rdi slot */ 907 pushq %rax /* push real oldrax to the rdi slot */
@@ -745,6 +914,7 @@ END(spurious_interrupt)
745 914
746 .macro errorentry sym 915 .macro errorentry sym
747 XCPT_FRAME 916 XCPT_FRAME
917 PARAVIRT_ADJUST_EXCEPTION_FRAME
748 pushq %rax 918 pushq %rax
749 CFI_ADJUST_CFA_OFFSET 8 919 CFI_ADJUST_CFA_OFFSET 8
750 CFI_REL_OFFSET rax,0 920 CFI_REL_OFFSET rax,0
@@ -769,6 +939,9 @@ END(spurious_interrupt)
769 .if \ist 939 .if \ist
770 movq %gs:pda_data_offset, %rbp 940 movq %gs:pda_data_offset, %rbp
771 .endif 941 .endif
942 .if \irqtrace
943 TRACE_IRQS_OFF
944 .endif
772 movq %rsp,%rdi 945 movq %rsp,%rdi
773 movq ORIG_RAX(%rsp),%rsi 946 movq ORIG_RAX(%rsp),%rsi
774 movq $-1,ORIG_RAX(%rsp) 947 movq $-1,ORIG_RAX(%rsp)
@@ -814,7 +987,7 @@ paranoid_restore\trace:
814 jmp irq_return 987 jmp irq_return
815paranoid_userspace\trace: 988paranoid_userspace\trace:
816 GET_THREAD_INFO(%rcx) 989 GET_THREAD_INFO(%rcx)
817 movl threadinfo_flags(%rcx),%ebx 990 movl TI_flags(%rcx),%ebx
818 andl $_TIF_WORK_MASK,%ebx 991 andl $_TIF_WORK_MASK,%ebx
819 jz paranoid_swapgs\trace 992 jz paranoid_swapgs\trace
820 movq %rsp,%rdi /* &pt_regs */ 993 movq %rsp,%rdi /* &pt_regs */
@@ -895,7 +1068,8 @@ KPROBE_ENTRY(error_entry)
895 je error_kernelspace 1068 je error_kernelspace
896error_swapgs: 1069error_swapgs:
897 SWAPGS 1070 SWAPGS
898error_sti: 1071error_sti:
1072 TRACE_IRQS_OFF
899 movq %rdi,RDI(%rsp) 1073 movq %rdi,RDI(%rsp)
900 CFI_REL_OFFSET rdi,RDI 1074 CFI_REL_OFFSET rdi,RDI
901 movq %rsp,%rdi 1075 movq %rsp,%rdi
@@ -912,7 +1086,7 @@ error_exit:
912 testl %eax,%eax 1086 testl %eax,%eax
913 jne retint_kernel 1087 jne retint_kernel
914 LOCKDEP_SYS_EXIT_IRQ 1088 LOCKDEP_SYS_EXIT_IRQ
915 movl threadinfo_flags(%rcx),%edx 1089 movl TI_flags(%rcx),%edx
916 movl $_TIF_WORK_MASK,%edi 1090 movl $_TIF_WORK_MASK,%edi
917 andl %edi,%edx 1091 andl %edi,%edx
918 jnz retint_careful 1092 jnz retint_careful
@@ -926,11 +1100,11 @@ error_kernelspace:
926 iret run with kernel gs again, so don't set the user space flag. 1100 iret run with kernel gs again, so don't set the user space flag.
927 B stepping K8s sometimes report an truncated RIP for IRET 1101 B stepping K8s sometimes report an truncated RIP for IRET
928 exceptions returning to compat mode. Check for these here too. */ 1102 exceptions returning to compat mode. Check for these here too. */
929 leaq irq_return(%rip),%rbp 1103 leaq irq_return(%rip),%rcx
930 cmpq %rbp,RIP(%rsp) 1104 cmpq %rcx,RIP(%rsp)
931 je error_swapgs 1105 je error_swapgs
932 movl %ebp,%ebp /* zero extend */ 1106 movl %ecx,%ecx /* zero extend */
933 cmpq %rbp,RIP(%rsp) 1107 cmpq %rcx,RIP(%rsp)
934 je error_swapgs 1108 je error_swapgs
935 cmpq $gs_change,RIP(%rsp) 1109 cmpq $gs_change,RIP(%rsp)
936 je error_swapgs 1110 je error_swapgs
@@ -939,7 +1113,7 @@ KPROBE_END(error_entry)
939 1113
940 /* Reload gs selector with exception handling */ 1114 /* Reload gs selector with exception handling */
941 /* edi: new selector */ 1115 /* edi: new selector */
942ENTRY(load_gs_index) 1116ENTRY(native_load_gs_index)
943 CFI_STARTPROC 1117 CFI_STARTPROC
944 pushf 1118 pushf
945 CFI_ADJUST_CFA_OFFSET 8 1119 CFI_ADJUST_CFA_OFFSET 8
@@ -953,7 +1127,7 @@ gs_change:
953 CFI_ADJUST_CFA_OFFSET -8 1127 CFI_ADJUST_CFA_OFFSET -8
954 ret 1128 ret
955 CFI_ENDPROC 1129 CFI_ENDPROC
956ENDPROC(load_gs_index) 1130ENDPROC(native_load_gs_index)
957 1131
958 .section __ex_table,"a" 1132 .section __ex_table,"a"
959 .align 8 1133 .align 8
@@ -1069,12 +1243,13 @@ ENTRY(simd_coprocessor_error)
1069END(simd_coprocessor_error) 1243END(simd_coprocessor_error)
1070 1244
1071ENTRY(device_not_available) 1245ENTRY(device_not_available)
1072 zeroentry math_state_restore 1246 zeroentry do_device_not_available
1073END(device_not_available) 1247END(device_not_available)
1074 1248
1075 /* runs on exception stack */ 1249 /* runs on exception stack */
1076KPROBE_ENTRY(debug) 1250KPROBE_ENTRY(debug)
1077 INTR_FRAME 1251 INTR_FRAME
1252 PARAVIRT_ADJUST_EXCEPTION_FRAME
1078 pushq $0 1253 pushq $0
1079 CFI_ADJUST_CFA_OFFSET 8 1254 CFI_ADJUST_CFA_OFFSET 8
1080 paranoidentry do_debug, DEBUG_STACK 1255 paranoidentry do_debug, DEBUG_STACK
@@ -1084,6 +1259,7 @@ KPROBE_END(debug)
1084 /* runs on exception stack */ 1259 /* runs on exception stack */
1085KPROBE_ENTRY(nmi) 1260KPROBE_ENTRY(nmi)
1086 INTR_FRAME 1261 INTR_FRAME
1262 PARAVIRT_ADJUST_EXCEPTION_FRAME
1087 pushq $-1 1263 pushq $-1
1088 CFI_ADJUST_CFA_OFFSET 8 1264 CFI_ADJUST_CFA_OFFSET 8
1089 paranoidentry do_nmi, 0, 0 1265 paranoidentry do_nmi, 0, 0
@@ -1097,6 +1273,7 @@ KPROBE_END(nmi)
1097 1273
1098KPROBE_ENTRY(int3) 1274KPROBE_ENTRY(int3)
1099 INTR_FRAME 1275 INTR_FRAME
1276 PARAVIRT_ADJUST_EXCEPTION_FRAME
1100 pushq $0 1277 pushq $0
1101 CFI_ADJUST_CFA_OFFSET 8 1278 CFI_ADJUST_CFA_OFFSET 8
1102 paranoidentry do_int3, DEBUG_STACK 1279 paranoidentry do_int3, DEBUG_STACK
@@ -1120,13 +1297,10 @@ ENTRY(coprocessor_segment_overrun)
1120 zeroentry do_coprocessor_segment_overrun 1297 zeroentry do_coprocessor_segment_overrun
1121END(coprocessor_segment_overrun) 1298END(coprocessor_segment_overrun)
1122 1299
1123ENTRY(reserved)
1124 zeroentry do_reserved
1125END(reserved)
1126
1127 /* runs on exception stack */ 1300 /* runs on exception stack */
1128ENTRY(double_fault) 1301ENTRY(double_fault)
1129 XCPT_FRAME 1302 XCPT_FRAME
1303 PARAVIRT_ADJUST_EXCEPTION_FRAME
1130 paranoidentry do_double_fault 1304 paranoidentry do_double_fault
1131 jmp paranoid_exit1 1305 jmp paranoid_exit1
1132 CFI_ENDPROC 1306 CFI_ENDPROC
@@ -1143,6 +1317,7 @@ END(segment_not_present)
1143 /* runs on exception stack */ 1317 /* runs on exception stack */
1144ENTRY(stack_segment) 1318ENTRY(stack_segment)
1145 XCPT_FRAME 1319 XCPT_FRAME
1320 PARAVIRT_ADJUST_EXCEPTION_FRAME
1146 paranoidentry do_stack_segment 1321 paranoidentry do_stack_segment
1147 jmp paranoid_exit1 1322 jmp paranoid_exit1
1148 CFI_ENDPROC 1323 CFI_ENDPROC
@@ -1168,6 +1343,7 @@ END(spurious_interrupt_bug)
1168 /* runs on exception stack */ 1343 /* runs on exception stack */
1169ENTRY(machine_check) 1344ENTRY(machine_check)
1170 INTR_FRAME 1345 INTR_FRAME
1346 PARAVIRT_ADJUST_EXCEPTION_FRAME
1171 pushq $0 1347 pushq $0
1172 CFI_ADJUST_CFA_OFFSET 8 1348 CFI_ADJUST_CFA_OFFSET 8
1173 paranoidentry do_machine_check 1349 paranoidentry do_machine_check
@@ -1202,3 +1378,103 @@ KPROBE_ENTRY(ignore_sysret)
1202 sysret 1378 sysret
1203 CFI_ENDPROC 1379 CFI_ENDPROC
1204ENDPROC(ignore_sysret) 1380ENDPROC(ignore_sysret)
1381
1382#ifdef CONFIG_XEN
1383ENTRY(xen_hypervisor_callback)
1384 zeroentry xen_do_hypervisor_callback
1385END(xen_hypervisor_callback)
1386
1387/*
1388# A note on the "critical region" in our callback handler.
1389# We want to avoid stacking callback handlers due to events occurring
1390# during handling of the last event. To do this, we keep events disabled
1391# until we've done all processing. HOWEVER, we must enable events before
1392# popping the stack frame (can't be done atomically) and so it would still
1393# be possible to get enough handler activations to overflow the stack.
1394# Although unlikely, bugs of that kind are hard to track down, so we'd
1395# like to avoid the possibility.
1396# So, on entry to the handler we detect whether we interrupted an
1397# existing activation in its critical region -- if so, we pop the current
1398# activation and restart the handler using the previous one.
1399*/
1400ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
1401 CFI_STARTPROC
1402/* Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
1403 see the correct pointer to the pt_regs */
1404 movq %rdi, %rsp # we don't return, adjust the stack frame
1405 CFI_ENDPROC
1406 CFI_DEFAULT_STACK
140711: incl %gs:pda_irqcount
1408 movq %rsp,%rbp
1409 CFI_DEF_CFA_REGISTER rbp
1410 cmovzq %gs:pda_irqstackptr,%rsp
1411 pushq %rbp # backlink for old unwinder
1412 call xen_evtchn_do_upcall
1413 popq %rsp
1414 CFI_DEF_CFA_REGISTER rsp
1415 decl %gs:pda_irqcount
1416 jmp error_exit
1417 CFI_ENDPROC
1418END(xen_do_hypervisor_callback)
1419
1420/*
1421# Hypervisor uses this for application faults while it executes.
1422# We get here for two reasons:
1423# 1. Fault while reloading DS, ES, FS or GS
1424# 2. Fault while executing IRET
1425# Category 1 we do not need to fix up as Xen has already reloaded all segment
1426# registers that could be reloaded and zeroed the others.
1427# Category 2 we fix up by killing the current process. We cannot use the
1428# normal Linux return path in this case because if we use the IRET hypercall
1429# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1430# We distinguish between categories by comparing each saved segment register
1431# with its current contents: any discrepancy means we are in category 1.
1432*/
1433ENTRY(xen_failsafe_callback)
1434 framesz = (RIP-0x30) /* workaround buggy gas */
1435 _frame framesz
1436 CFI_REL_OFFSET rcx, 0
1437 CFI_REL_OFFSET r11, 8
1438 movw %ds,%cx
1439 cmpw %cx,0x10(%rsp)
1440 CFI_REMEMBER_STATE
1441 jne 1f
1442 movw %es,%cx
1443 cmpw %cx,0x18(%rsp)
1444 jne 1f
1445 movw %fs,%cx
1446 cmpw %cx,0x20(%rsp)
1447 jne 1f
1448 movw %gs,%cx
1449 cmpw %cx,0x28(%rsp)
1450 jne 1f
1451 /* All segments match their saved values => Category 2 (Bad IRET). */
1452 movq (%rsp),%rcx
1453 CFI_RESTORE rcx
1454 movq 8(%rsp),%r11
1455 CFI_RESTORE r11
1456 addq $0x30,%rsp
1457 CFI_ADJUST_CFA_OFFSET -0x30
1458 pushq $0
1459 CFI_ADJUST_CFA_OFFSET 8
1460 pushq %r11
1461 CFI_ADJUST_CFA_OFFSET 8
1462 pushq %rcx
1463 CFI_ADJUST_CFA_OFFSET 8
1464 jmp general_protection
1465 CFI_RESTORE_STATE
14661: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1467 movq (%rsp),%rcx
1468 CFI_RESTORE rcx
1469 movq 8(%rsp),%r11
1470 CFI_RESTORE r11
1471 addq $0x30,%rsp
1472 CFI_ADJUST_CFA_OFFSET -0x30
1473 pushq $0
1474 CFI_ADJUST_CFA_OFFSET 8
1475 SAVE_ALL
1476 jmp error_exit
1477 CFI_ENDPROC
1478END(xen_failsafe_callback)
1479
1480#endif /* CONFIG_XEN */
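
One detail of the audit fast path above worth spelling out: sysret_audit (and sysexit_audit in the 32-bit file) turns the syscall return value into the audit result code with the setl/movzbl/inc sequence, so a negative return becomes 2 and anything else becomes 1, matching the 0->1 (AUDITSC_SUCCESS) and 1->2 (AUDITSC_FAILURE) mapping noted in the assembly comments. The C equivalent, as an illustrative sketch:

	/* Illustrative C equivalent of the setl/movzbl/inc trick above. */
	static inline int audit_result_code(long syscall_ret)
	{
		/* 0 -> 1 (AUDITSC_SUCCESS), 1 -> 2 (AUDITSC_FAILURE), per the
		 * comments in sysret_audit/sysexit_audit. */
		return (syscall_ret < 0) ? 2 : 1;
	}
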
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index f5d6f7d8b86e..f454c78fcef6 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/apicdef.h> 41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h> 42#include <mach_mpparse.h>
44 43
45/* 44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
113#endif
114
115struct mip_reg {
116 unsigned long long off_0;
117 unsigned long long off_8;
118 unsigned long long off_10;
119 unsigned long long off_18;
120 unsigned long long off_20;
121 unsigned long long off_28;
122 unsigned long long off_30;
123 unsigned long long off_38;
124};
125
126#define MIP_SW_APIC 0x1020b
127#define MIP_FUNC(VALUE) (VALUE & 0xff)
128
129/*
46 * ES7000 Globals 130 * ES7000 Globals
47 */ 131 */
48 132
@@ -52,6 +136,8 @@ static struct mip_reg *host_reg;
52static int mip_port; 136static int mip_port;
53static unsigned long mip_addr, host_addr; 137static unsigned long mip_addr, host_addr;
54 138
139int es7000_plat;
140
55/* 141/*
56 * GSI override for ES7000 platforms. 142 * GSI override for ES7000 platforms.
57 */ 143 */
@@ -70,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi)
70 base += nr_ioapic_registers[i]; 156 base += nr_ioapic_registers[i];
71 } 157 }
72 158
73 if (!ioapic && (gsi < 16)) 159 if (!ioapic && (gsi < 16))
74 gsi += base; 160 gsi += base;
75 return gsi; 161 return gsi;
76} 162}
@@ -128,10 +214,10 @@ parse_unisys_oem (char *oemptr)
128 mip_addr = val; 214 mip_addr = val;
129 mip = (struct mip_reg *)val; 215 mip = (struct mip_reg *)val;
130 mip_reg = __va(mip); 216 mip_reg = __va(mip);
131 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 217 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
132 (unsigned long)host_reg); 218 (unsigned long)host_reg);
133 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 219 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
134 (unsigned long)mip_reg); 220 (unsigned long)mip_reg);
135 success++; 221 success++;
136 break; 222 break;
137 case MIP_PSAI_REG: 223 case MIP_PSAI_REG:
@@ -158,69 +244,39 @@ parse_unisys_oem (char *oemptr)
158} 244}
159 245
160#ifdef CONFIG_ACPI 246#ifdef CONFIG_ACPI
161int __init 247static unsigned long oem_addrX;
162find_unisys_acpi_oem_table(unsigned long *oem_addr) 248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
163{ 250{
164 struct acpi_table_header *header = NULL; 251 struct acpi_table_header *header = NULL;
165 int i = 0; 252 int i = 0;
166 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { 253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
167 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { 256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
168 struct oem_table *t = (struct oem_table *)header; 257 struct oem_table *t = (struct oem_table *)header;
169 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, 258
170 t->OEMTableSize); 259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
171 return 0; 265 return 0;
172 } 266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
173 } 268 }
174 return -1; 269 return -1;
175} 270}
176#endif
177 271
178/* 272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
179 * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
180 * arch already has got following function definitions (asm-generic/es7000.c)
181 * hence no need to define these for that case.
182 */
183#ifndef CONFIG_X86_GENERICARCH
184void es7000_sw_apic(void);
185void __init enable_apic_mode(void)
186{ 273{
187 es7000_sw_apic(); 274 if (!oem_addr)
188 return; 275 return;
189}
190 276
191__init int mps_oem_check(struct mp_config_table *mpc, char *oem, 277 __acpi_unmap_table((char *)oem_addr, oem_size);
192 char *productid)
193{
194 if (mpc->mpc_oemptr) {
195 struct mp_config_oemtable *oem_table =
196 (struct mp_config_oemtable *)mpc->mpc_oemptr;
197 if (!strncmp(oem, "UNISYS", 6))
198 return parse_unisys_oem((char *)oem_table);
199 }
200 return 0;
201}
202#ifdef CONFIG_ACPI
203/* Hook from generic ACPI tables.c */
204int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
205{
206 unsigned long oem_addr;
207 if (!find_unisys_acpi_oem_table(&oem_addr)) {
208 if (es7000_check_dsdt())
209 return parse_unisys_oem((char *)oem_addr);
210 else {
211 setup_unisys();
212 return 1;
213 }
214 }
215 return 0;
216}
217#else
218int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
219{
220 return 0;
221} 278}
222#endif 279#endif
223#endif /* CONFIG_X86_GENERICARCH */
224 280
225static void 281static void
226es7000_spin(int n) 282es7000_spin(int n)
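
The MIP_* macros added above treat the 64-bit MIP mailbox value as a packed record: a validity bit, a port number in bits 47:32, the low 32 data bits, and a function code in the low byte. A standalone C sketch (not kernel code; the sample value is invented) of the same field extraction:

/*
 * Illustrative userspace sketch: how the MIP_* extractors above decode a
 * 64-bit MIP mailbox value.  The sample value is made up for demonstration.
 */
#include <stdio.h>
#include <stdint.h>

#define MIP_VALID       0x0100000000000000ULL
#define MIP_PORT(v)     (((v) >> 32) & 0xffff)
#define MIP_RD_LO(v)    ((v) & 0xffffffff)
#define MIP_FUNC(v)     ((v) & 0xff)

int main(void)
{
        uint64_t val = MIP_VALID | (0x1234ULL << 32) | 0x0001020bULL;

        printf("valid=%d port=0x%llx lo=0x%llx func=0x%llx\n",
               (val & MIP_VALID) != 0,
               (unsigned long long)MIP_PORT(val),
               (unsigned long long)MIP_RD_LO(val),
               (unsigned long long)MIP_FUNC(val));
        return 0;
}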
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000000..ab115cd15fdf
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,141 @@
1/*
2 * Code for replacing ftrace calls with jumps.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 *
6 * Thanks goes to Ingo Molnar, for suggesting the idea.
7 * Mathieu Desnoyers, for suggesting postponing the modifications.
8 * Arjan van de Ven, for keeping me straight, and explaining to me
9 * the dangers of modifying code on the run.
10 */
11
12#include <linux/spinlock.h>
13#include <linux/hardirq.h>
14#include <linux/ftrace.h>
15#include <linux/percpu.h>
16#include <linux/init.h>
17#include <linux/list.h>
18
19#include <asm/alternative.h>
20#include <asm/ftrace.h>
21
22
23/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop;
25
26union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE];
28 struct {
29 char e8;
30 int offset;
31 } __attribute__((packed));
32};
33
34
35static int notrace ftrace_calc_offset(long ip, long addr)
36{
37 return (int)(addr - ip);
38}
39
40notrace unsigned char *ftrace_nop_replace(void)
41{
42 return (char *)ftrace_nop;
43}
44
45notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{
47 static union ftrace_code_union calc;
48
49 calc.e8 = 0xe8;
50 calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
51
52 /*
53 * No locking needed, this must be called via kstop_machine
54 * which in essence is like running on a uniprocessor machine.
55 */
56 return calc.code;
57}
58
59notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code)
62{
63 unsigned replaced;
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68
69 /*
70 * Note: Due to modules and __init, code can
71 * disappear and change, we need to protect against faulting
72 * as well as code changing.
73 *
74 * No real locking needed, this code is run through
75 * kstop_machine.
76 */
77 asm volatile (
78 "1: lock\n"
79 " cmpxchg %3, (%2)\n"
80 " jnz 2f\n"
81 " movb %b4, 4(%2)\n"
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93
94 if (replaced != old && replaced != new)
95 faulted = 2;
96
97 return faulted;
98}
99
100notrace int ftrace_update_ftrace_func(ftrace_func_t func)
101{
102 unsigned long ip = (unsigned long)(&ftrace_call);
103 unsigned char old[MCOUNT_INSN_SIZE], *new;
104 int ret;
105
106 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
107 new = ftrace_call_replace(ip, (unsigned long)func);
108 ret = ftrace_modify_code(ip, old, new);
109
110 return ret;
111}
112
113notrace int ftrace_mcount_set(unsigned long *data)
114{
115 unsigned long ip = (long)(&mcount_call);
116 unsigned long *addr = data;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0;
128}
129
130int __init ftrace_dyn_arch_init(void *data)
131{
132 const unsigned char *const *noptable = find_nop_table();
133
134 /* This is running in kstop_machine */
135
136 ftrace_mcount_set(data);
137
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
139
140 return 0;
141}
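
ftrace_call_replace() above hand-assembles a near call: opcode 0xe8 followed by a 32-bit displacement measured from the end of the 5-byte instruction. A standalone C sketch of that encoding (the addresses are invented; this is not the kernel's code path):

/*
 * Sketch of the 5-byte call encoding built by ftrace_call_replace():
 * 0xe8 followed by a rel32 displacement computed from the end of the
 * instruction (ip + 5).  The addresses are made-up example values.
 */
#include <stdio.h>

#define MCOUNT_INSN_SIZE 5

union code {
        unsigned char bytes[MCOUNT_INSN_SIZE];
        struct {
                unsigned char e8;
                int offset;
        } __attribute__((packed)) insn;
};

int main(void)
{
        unsigned long ip   = 0x400100;  /* site of the call instruction */
        unsigned long addr = 0x400500;  /* target function */
        union code c;

        c.insn.e8 = 0xe8;
        c.insn.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE));

        for (int i = 0; i < MCOUNT_INSN_SIZE; i++)
                printf("%02x ", c.bytes[i]);
        printf("\n");   /* e8 fb 03 00 00 -> call +0x3fb */
        return 0;
}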
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
26#endif 27extern struct genapic apic_x2xpic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (num_possible_cpus() <= 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
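
The rewritten acpi_madt_oem_check() above turns APIC-model selection into a table walk: each genapic exposes an ->acpi_madt_oem_check() hook and the first driver that claims the platform wins, with apic_flat as the fallback. A minimal userspace sketch of the same first-match probe pattern (driver names and probe rules here are invented, not the kernel's):

/* Minimal sketch of the first-match probe pattern used above. */
#include <stdio.h>
#include <string.h>

struct driver {
        const char *name;
        int (*probe)(const char *oem_id);
};

static int probe_sgi(const char *oem_id)  { return strcmp(oem_id, "SGI") == 0; }
static int probe_none(const char *oem_id) { (void)oem_id; return 0; }

static struct driver uv_drv   = { "uv-like",   probe_sgi  };
static struct driver phys_drv = { "phys-like", probe_none };

static struct driver *probe_table[] = { &uv_drv, &phys_drv, NULL };

static const char *pick(const char *oem_id)
{
        for (int i = 0; probe_table[i]; i++)
                if (probe_table[i]->probe(oem_id))
                        return probe_table[i]->name;
        return "flat (default)";        /* nothing claimed the platform */
}

int main(void)
{
        printf("OEM \"SGI\"  -> %s\n", pick("SGI"));
        printf("OEM \"ACME\" -> %s\n", pick("ACME"));
        return 0;
}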
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..9eca5ba7a6b1 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,21 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
183 return 1;
184#endif
185
186 return 0;
187}
133 188
134static cpumask_t physflat_target_cpus(void) 189static cpumask_t physflat_target_cpus(void)
135{ 190{
@@ -168,7 +223,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 223 * May as well be the first.
169 */ 224 */
170 cpu = first_cpu(cpumask); 225 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 226 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 227 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 228 else
174 return BAD_APICID; 229 return BAD_APICID;
@@ -176,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 231
177struct genapic apic_physflat = { 232struct genapic apic_physflat = {
178 .name = "physical flat", 233 .name = "physical flat",
234 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 235 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 236 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 237 .target_cpus = physflat_target_cpus,
@@ -185,6 +241,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 241 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 242 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 243 .send_IPI_mask = physflat_send_IPI_mask,
244 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 246 .phys_pkg_id = phys_pkg_id,
247 .get_apic_id = get_apic_id,
248 .set_apic_id = set_apic_id,
249 .apic_id_mask = (0xFFu<<24),
190}; 250};
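
The get_apic_id()/set_apic_id() helpers added to the flat drivers above simply move the 8-bit xAPIC ID in and out of bits 31:24 of the APIC_ID register, which is why apic_id_mask is 0xFF<<24. A tiny standalone sketch of that round trip:

/* Sketch of the xAPIC ID placement used by the flat genapic above:
 * the 8-bit ID lives in bits 31:24 of the APIC_ID register. */
#include <stdio.h>
#include <assert.h>

static unsigned int get_apic_id(unsigned long x)  { return (x >> 24) & 0xFFu; }
static unsigned long set_apic_id(unsigned int id) { return (unsigned long)(id & 0xFFu) << 24; }

int main(void)
{
        unsigned int id = 0x2a;
        unsigned long reg = set_apic_id(id);

        assert(get_apic_id(reg) == id);
        printf("id 0x%x <-> register 0x%lx\n", id, reg);
        return 0;
}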
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..e4bf2cc0d743
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
54 * for now, we send the IPI's one by one in the cpumask.
55 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
56 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
57 * writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
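
The TBD comment above notes that IPIs are still sent one destination per ICR write even though cluster-mode x2APIC can address a whole 16-CPU cluster at once: the logical ID read from APIC_LDR encodes the cluster number in bits 31:16 and a one-hot member bit in bits 15:0, so same-cluster destinations can be OR-ed together. A hedged userspace sketch of that grouping (the LDR layout is the architectural x2APIC cluster format; the per-CPU values are invented):

/*
 * Sketch of the cluster-grouping optimization hinted at in the TBD above.
 * In x2APIC cluster mode the logical ID is (cluster << 16) | (1 << pos),
 * so destinations sharing a cluster can be OR-ed into one ICR write.
 */
#include <stdio.h>
#include <stdint.h>

#define NCPU 4

int main(void)
{
        uint32_t logical_id[NCPU] = {
                (1u << 16) | (1u << 0), /* cluster 1, member 0 */
                (1u << 16) | (1u << 3), /* cluster 1, member 3 */
                (2u << 16) | (1u << 0), /* cluster 2, member 0 */
                (1u << 16) | (1u << 5), /* cluster 1, member 5 */
        };
        uint32_t dest[4] = { 0 };       /* accumulated destination per cluster */

        for (int cpu = 0; cpu < NCPU; cpu++) {
                uint32_t cluster = logical_id[cpu] >> 16;
                dest[cluster] |= logical_id[cpu];
        }

        for (int c = 0; c < 4; c++)
                if (dest[c])
                        printf("cluster %d: ICR dest 0x%08x (one write)\n", c, dest[c]);
        return 0;
}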
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..8f1343df2627
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
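
phys_pkg_id() in both x2apic drivers derives the package number by shifting the initial APIC ID right by index_msb, the number of low APIC-ID bits that enumerate threads and cores inside one package. A small sketch with an invented topology:

/* Sketch of phys_pkg_id(): the package number is the APIC ID with the
 * low index_msb bits (threads + cores within a package) shifted away.
 * The topology values are invented. */
#include <stdio.h>

int main(void)
{
        unsigned int apicid = 0x1b;     /* package 3, core/thread bits 0..2 */
        int index_msb = 3;              /* 8 logical CPUs per package */

        printf("apicid 0x%x -> package %u\n", apicid, apicid >> index_msb);
        return 0;
}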
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a743..33581d94a90e 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,23 +5,56 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#include <linux/kernel.h>
11#include <linux/threads.h> 12#include <linux/threads.h>
12#include <linux/cpumask.h> 13#include <linux/cpumask.h>
13#include <linux/string.h> 14#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/pgtable.h>
23#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
25 58
26DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
27EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -38,6 +71,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
38short uv_possible_blades; 71short uv_possible_blades;
39EXPORT_SYMBOL_GPL(uv_possible_blades); 72EXPORT_SYMBOL_GPL(uv_possible_blades);
40 73
74unsigned long sn_rtc_cycles_per_second;
75EXPORT_SYMBOL(sn_rtc_cycles_per_second);
76
41/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 77/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
42 78
43static cpumask_t uv_target_cpus(void) 79static cpumask_t uv_target_cpus(void)
@@ -55,44 +91,44 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
55int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 91int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
56{ 92{
57 unsigned long val; 93 unsigned long val;
58 int nasid; 94 int pnode;
59 95
60 nasid = uv_apicid_to_nasid(phys_apicid); 96 pnode = uv_apicid_to_pnode(phys_apicid);
61 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 97 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
62 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 98 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
63 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 99 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
64 APIC_DM_INIT; 100 APIC_DM_INIT;
65 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 101 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
66 mdelay(10); 102 mdelay(10);
67 103
68 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 104 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
69 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 105 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
70 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 106 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
71 APIC_DM_STARTUP; 107 APIC_DM_STARTUP;
72 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 108 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
73 return 0; 109 return 0;
74} 110}
75 111
76static void uv_send_IPI_one(int cpu, int vector) 112static void uv_send_IPI_one(int cpu, int vector)
77{ 113{
78 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
79 int nasid; 115 int pnode;
80 116
81 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
82 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
83 nasid = uv_apicid_to_nasid(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
84 val = 120 val =
85 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << 121 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
86 UVH_IPI_INT_APIC_ID_SHFT) | 122 UVH_IPI_INT_APIC_ID_SHFT) |
87 (vector << UVH_IPI_INT_VECTOR_SHFT); 123 (vector << UVH_IPI_INT_VECTOR_SHFT);
88 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 124 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
89} 125}
90 126
91static void uv_send_IPI_mask(cpumask_t mask, int vector) 127static void uv_send_IPI_mask(cpumask_t mask, int vector)
92{ 128{
93 unsigned int cpu; 129 unsigned int cpu;
94 130
95 for (cpu = 0; cpu < NR_CPUS; ++cpu) 131 for_each_possible_cpu(cpu)
96 if (cpu_isset(cpu, mask)) 132 if (cpu_isset(cpu, mask))
97 uv_send_IPI_one(cpu, vector); 133 uv_send_IPI_one(cpu, vector);
98} 134}
@@ -117,6 +153,10 @@ static int uv_apic_id_registered(void)
117 return 1; 153 return 1;
118} 154}
119 155
156static void uv_init_apic_ldr(void)
157{
158}
159
120static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
121{ 161{
122 int cpu; 162 int cpu;
@@ -126,72 +166,227 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
126 * May as well be the first. 166 * May as well be the first.
127 */ 167 */
128 cpu = first_cpu(cpumask); 168 cpu = first_cpu(cpumask);
129 if ((unsigned)cpu < NR_CPUS) 169 if ((unsigned)cpu < nr_cpu_ids)
130 return per_cpu(x86_cpu_to_apicid, cpu); 170 return per_cpu(x86_cpu_to_apicid, cpu);
131 else 171 else
132 return BAD_APICID; 172 return BAD_APICID;
133} 173}
134 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
189 /* maskout x2apic_extra_bits ? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
135static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
136{ 201{
137 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
138} 203}
139 204
140#ifdef ZZZ /* Needs x2apic patch */
141static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
142{ 206{
143 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
144} 208}
145#endif
146 209
147struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
148 .name = "UV large system", 211 .name = "UV large system",
212 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
149 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
150 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
151 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
152 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
153 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
218 .init_apic_ldr = uv_init_apic_ldr,
154 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
155 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
156 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
157 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
158 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
159 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
225 .get_apic_id = get_apic_id,
226 .set_apic_id = set_apic_id,
227 .apic_id_mask = (0xFFFFFFFFu),
160}; 228};
161 229
162static __cpuinit void set_x2apic_extra_bits(int nasid) 230static __cpuinit void set_x2apic_extra_bits(int pnode)
163{ 231{
164 __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6); 232 __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
165} 233}
166 234
167/* 235/*
168 * Called on boot cpu. 236 * Called on boot cpu.
169 */ 237 */
170static __init void uv_system_init(void) 238static __init int boot_pnode_to_blade(int pnode)
239{
240 int blade;
241
242 for (blade = 0; blade < uv_num_possible_blades(); blade++)
243 if (pnode == uv_blade_info[blade].pnode)
244 return blade;
245 BUG();
246}
247
248struct redir_addr {
249 unsigned long redirect;
250 unsigned long alias;
251};
252
253#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
254
255static __initdata struct redir_addr redir_addrs[] = {
256 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
257 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
258 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
259};
260
261static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
262{
263 union uvh_si_alias0_overlay_config_u alias;
264 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
265 int i;
266
267 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
268 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
269 if (alias.s.base == 0) {
270 *size = (1UL << alias.s.m_alias);
271 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
272 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
273 return;
274 }
275 }
276 BUG();
277}
278
279static __init void map_low_mmrs(void)
280{
281 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
282 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
283}
284
285enum map_type {map_wb, map_uc};
286
287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
289{
290 unsigned long bytes, paddr;
291
292 paddr = base << shift;
293 bytes = (1UL << shift) * (max_pnode + 1);
294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
295 paddr + bytes);
296 if (map_type == map_uc)
297 init_extra_mapping_uc(paddr, bytes);
298 else
299 init_extra_mapping_wb(paddr, bytes);
300
301}
302static __init void map_gru_high(int max_pnode)
303{
304 union uvh_rh_gam_gru_overlay_config_mmr_u gru;
305 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
306
307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
308 if (gru.s.enable)
309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
310}
311
312static __init void map_config_high(int max_pnode)
313{
314 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
315 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
316
317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
318 if (cfg.s.enable)
319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
320}
321
322static __init void map_mmr_high(int max_pnode)
323{
324 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
325 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
326
327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
328 if (mmr.s.enable)
329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
330}
331
332static __init void map_mmioh_high(int max_pnode)
333{
334 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
335 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
336
337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
338 if (mmioh.s.enable)
339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
340}
341
342static __init void uv_rtc_init(void)
343{
344 long status, ticks_per_sec, drift;
345
346 status =
347 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
348 &drift);
349 if (status != 0 || ticks_per_sec < 100000) {
350 printk(KERN_WARNING
351 "unable to determine platform RTC clock frequency, "
352 "guessing.\n");
353 /* BIOS gives wrong value for clock freq. so guess */
354 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
355 } else
356 sn_rtc_cycles_per_second = ticks_per_sec;
357}
358
359static bool uv_system_inited;
360
361void __init uv_system_init(void)
171{ 362{
172 union uvh_si_addr_map_config_u m_n_config; 363 union uvh_si_addr_map_config_u m_n_config;
173 int bytes, nid, cpu, lcpu, nasid, last_nasid, blade; 364 union uvh_node_id_u node_id;
174 unsigned long mmr_base; 365 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
366 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
367 int max_pnode = 0;
368 unsigned long mmr_base, present;
369
370 map_low_mmrs();
175 371
176 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 372 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
373 m_val = m_n_config.s.m_skt;
374 n_val = m_n_config.s.n_skt;
177 mmr_base = 375 mmr_base =
178 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 376 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
179 ~UV_MMR_ENABLE; 377 ~UV_MMR_ENABLE;
180 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 378 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
181 379
182 last_nasid = -1; 380 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
183 for_each_possible_cpu(cpu) { 381 uv_possible_blades +=
184 nid = cpu_to_node(cpu); 382 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
185 nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
186 if (nasid != last_nasid)
187 uv_possible_blades++;
188 last_nasid = nasid;
189 }
190 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 383 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
191 384
192 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 385 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
193 uv_blade_info = alloc_bootmem_pages(bytes); 386 uv_blade_info = alloc_bootmem_pages(bytes);
194 387
388 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
389
195 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 390 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
196 uv_node_to_blade = alloc_bootmem_pages(bytes); 391 uv_node_to_blade = alloc_bootmem_pages(bytes);
197 memset(uv_node_to_blade, 255, bytes); 392 memset(uv_node_to_blade, 255, bytes);
@@ -200,51 +395,74 @@ static __init void uv_system_init(void)
200 uv_cpu_to_blade = alloc_bootmem_pages(bytes); 395 uv_cpu_to_blade = alloc_bootmem_pages(bytes);
201 memset(uv_cpu_to_blade, 255, bytes); 396 memset(uv_cpu_to_blade, 255, bytes);
202 397
203 last_nasid = -1; 398 blade = 0;
204 blade = -1; 399 for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
205 lcpu = -1; 400 present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
206 for_each_possible_cpu(cpu) { 401 for (j = 0; j < 64; j++) {
207 nid = cpu_to_node(cpu); 402 if (!test_bit(j, &present))
208 nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); 403 continue;
209 if (nasid != last_nasid) { 404 uv_blade_info[blade].pnode = (i * 64 + j);
210 blade++; 405 uv_blade_info[blade].nr_possible_cpus = 0;
211 lcpu = -1;
212 uv_blade_info[blade].nr_posible_cpus = 0;
213 uv_blade_info[blade].nr_online_cpus = 0; 406 uv_blade_info[blade].nr_online_cpus = 0;
407 blade++;
214 } 408 }
215 last_nasid = nasid; 409 }
216 lcpu++; 410
411 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
412 gnode_upper = (((unsigned long)node_id.s.node_id) &
413 ~((1 << n_val) - 1)) << m_val;
217 414
218 uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt; 415 uv_rtc_init();
219 uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt; 416
417 for_each_present_cpu(cpu) {
418 nid = cpu_to_node(cpu);
419 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
420 blade = boot_pnode_to_blade(pnode);
421 lcpu = uv_blade_info[blade].nr_possible_cpus;
422 uv_blade_info[blade].nr_possible_cpus++;
423
424 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
425 uv_cpu_hub_info(cpu)->lowmem_remap_top =
426 lowmem_redir_base + lowmem_redir_size;
427 uv_cpu_hub_info(cpu)->m_val = m_val;
428 uv_cpu_hub_info(cpu)->n_val = n_val;
220 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 429 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
221 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 430 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
222 uv_cpu_hub_info(cpu)->local_nasid = nasid; 431 uv_cpu_hub_info(cpu)->pnode = pnode;
223 uv_cpu_hub_info(cpu)->gnode_upper = 432 uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
224 nasid & ~((1 << uv_hub_info->n_val) - 1); 433 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
434 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
225 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 435 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
226 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 436 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
227 uv_blade_info[blade].nasid = nasid;
228 uv_blade_info[blade].nr_posible_cpus++;
229 uv_node_to_blade[nid] = blade; 437 uv_node_to_blade[nid] = blade;
230 uv_cpu_to_blade[cpu] = blade; 438 uv_cpu_to_blade[cpu] = blade;
439 max_pnode = max(pnode, max_pnode);
231 440
232 printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n", 441 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
233 cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid); 442 "lcpu %d, blade %d\n",
234 printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade); 443 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
444 lcpu, blade);
235 } 445 }
446
447 map_gru_high(max_pnode);
448 map_mmr_high(max_pnode);
449 map_config_high(max_pnode);
450 map_mmioh_high(max_pnode);
451 uv_system_inited = true;
236} 452}
237 453
238/* 454/*
239 * Called on each cpu to initialize the per_cpu UV data area. 455 * Called on each cpu to initialize the per_cpu UV data area.
456 * ZZZ hotplug not supported yet
240 */ 457 */
241void __cpuinit uv_cpu_init(void) 458void __cpuinit uv_cpu_init(void)
242{ 459{
243 if (!uv_node_to_blade) 460 BUG_ON(!uv_system_inited);
244 uv_system_init();
245 461
246 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; 462 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
247 463
248 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) 464 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
249 set_x2apic_extra_bits(uv_hub_info->local_nasid); 465 set_x2apic_extra_bits(uv_hub_info->pnode);
250} 466}
467
468
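
uv_system_init() above now sizes uv_blade_info by popcounting the 64-bit words of UVH_NODE_PRESENT_TABLE and then walks the same bitmap to assign a pnode to each blade, instead of scanning per-cpu NASIDs. A standalone sketch of that two-pass pattern (the bitmap contents are invented):

/*
 * Sketch of the blade-discovery pattern in uv_system_init() above: count
 * the set bits of each 64-bit "node present" word to size the table, then
 * walk the bits again to assign a pnode to each blade.
 */
#include <stdio.h>
#include <stdint.h>

#define TABLE_DEPTH 2

static int hweight64(uint64_t w)
{
        int n = 0;
        while (w) {
                n += w & 1;
                w >>= 1;
        }
        return n;
}

int main(void)
{
        uint64_t present[TABLE_DEPTH] = { 0x0000000000000005ULL,   /* pnodes 0, 2 */
                                          0x0000000000000001ULL }; /* pnode 64 */
        int possible_blades = 0, blade = 0;

        for (int i = 0; i < TABLE_DEPTH; i++)
                possible_blades += hweight64(present[i]);
        printf("found %d blades\n", possible_blades);

        for (int i = 0; i < TABLE_DEPTH; i++)
                for (int j = 0; j < 64; j++)
                        if (present[i] & (1ULL << j))
                                printf("blade %d -> pnode %d\n", blade++, i * 64 + j);
        return 0;
}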
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..1dcb0f13897e
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,56 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3
4#include <asm/setup.h>
5#include <asm/bios_ebda.h>
6
7#define BIOS_LOWMEM_KILOBYTES 0x413
8
9/*
10 * The BIOS places the EBDA/XBDA at the top of conventional
11 * memory, and usually decreases the reported amount of
12 * conventional memory (int 0x12) too. This also contains a
13 * workaround for Dell systems that neglect to reserve EBDA.
14 * The same workaround also avoids a problem with the AMD768MPX
15 * chipset: reserve a page before VGA to prevent PCI prefetch
16 * into it (errata #56). Usually the page is reserved anyways,
17 * unless you have no PS/2 mouse plugged in.
18 */
19void __init reserve_ebda_region(void)
20{
21 unsigned int lowmem, ebda_addr;
22
23 /* To determine the position of the EBDA and the */
24 /* end of conventional memory, we need to look at */
25 /* the BIOS data area. In a paravirtual environment */
26 /* that area is absent. We'll just have to assume */
27 /* that the paravirt case can handle memory setup */
28 /* correctly, without our help. */
29 if (paravirt_enabled())
30 return;
31
32 /* end of low (conventional) memory */
33 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
34 lowmem <<= 10;
35
36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
39
40 /* Fixup: bios puts an EBDA in the top 64K segment */
41 /* of conventional memory, but does not adjust lowmem. */
42 if ((lowmem - ebda_addr) <= 0x10000)
43 lowmem = ebda_addr;
44
45 /* Fixup: bios does not report an EBDA at all. */
46 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
47 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
48 lowmem = 0x9f000;
49
50 /* Paranoia: should never happen, but... */
51 if ((lowmem == 0) || (lowmem >= 0x100000))
52 lowmem = 0x9f000;
53
54 /* reserve all memory between lowmem and the 1MB mark */
55 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
56}
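
reserve_ebda_region() reads the size of conventional memory, in KiB, from the BIOS data area word at 0x413 and the EBDA start from get_bios_ebda(). Assuming the usual BDA layout (the EBDA pointer at 0x40E is a real-mode segment, shifted left by 4), the address arithmetic looks like this standalone sketch with invented BDA contents:

/*
 * Userspace sketch of the address arithmetic in reserve_ebda_region().
 * The BDA word at 0x413 holds conventional memory in KiB; the word at
 * 0x40E (what get_bios_ebda() is assumed to read) is the EBDA start as a
 * real-mode segment.  The sample BDA values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned short bda_lowmem_kb = 639;     /* word at 0x413 */
        unsigned short bda_ebda_seg  = 0x9fc0;  /* word at 0x40E */

        unsigned int lowmem    = (unsigned int)bda_lowmem_kb << 10; /* bytes */
        unsigned int ebda_addr = (unsigned int)bda_ebda_seg << 4;   /* bytes */

        /* Same fixup as the kernel: EBDA in the top 64K, lowmem not adjusted. */
        if (lowmem - ebda_addr <= 0x10000)
                lowmem = ebda_addr;

        printf("EBDA at 0x%x, reserve [0x%x, 0x100000) as BIOS reserved\n",
               ebda_addr, lowmem);
        return 0;
}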
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10 10
11#include <asm/setup.h>
12#include <asm/sections.h>
13#include <asm/e820.h>
14#include <asm/bios_ebda.h>
15
11void __init i386_start_kernel(void) 16void __init i386_start_kernel(void)
12{ 17{
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19
20#ifdef CONFIG_BLK_DEV_INITRD
21 /* Reserve INITRD */
22 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
23 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
24 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
25 u64 ramdisk_end = ramdisk_image + ramdisk_size;
26 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
27 }
28#endif
29 reserve_early(init_pg_tables_start, init_pg_tables_end,
30 "INIT_PG_TABLE");
31
32 reserve_ebda_region();
33
34 /*
35 * At this point everything still needed from the boot loader
36 * or BIOS or kernel text should be early reserved or marked not
37 * RAM in e820. All other memory is free game.
38 */
39
13 start_kernel(); 40 start_kernel();
14} 41}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa84..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,27 @@
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27 27
28/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly;
30
31#ifdef CONFIG_SMP
32/*
33 * We install an empty cpu_pda pointer table to indicate to early users
34 * (numa_set_node) that the cpu_pda pointer table for cpus other than
35 * the boot cpu is not yet setup.
36 */
37static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
38#else
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif
41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
28static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
29{ 50{
30 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -51,74 +72,6 @@ static void __init copy_bootdata(char *real_mode_data)
51 } 72 }
52} 73}
53 74
54#define BIOS_LOWMEM_KILOBYTES 0x413
55
56/*
57 * The BIOS places the EBDA/XBDA at the top of conventional
58 * memory, and usually decreases the reported amount of
59 * conventional memory (int 0x12) too. This also contains a
60 * workaround for Dell systems that neglect to reserve EBDA.
61 * The same workaround also avoids a problem with the AMD768MPX
62 * chipset: reserve a page before VGA to prevent PCI prefetch
63 * into it (errata #56). Usually the page is reserved anyways,
64 * unless you have no PS/2 mouse plugged in.
65 */
66static void __init reserve_ebda_region(void)
67{
68 unsigned int lowmem, ebda_addr;
69
70 /* To determine the position of the EBDA and the */
71 /* end of conventional memory, we need to look at */
72 /* the BIOS data area. In a paravirtual environment */
73 /* that area is absent. We'll just have to assume */
74 /* that the paravirt case can handle memory setup */
75 /* correctly, without our help. */
76 if (paravirt_enabled())
77 return;
78
79 /* end of low (conventional) memory */
80 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
81 lowmem <<= 10;
82
83 /* start of EBDA area */
84 ebda_addr = get_bios_ebda();
85
86 /* Fixup: bios puts an EBDA in the top 64K segment */
87 /* of conventional memory, but does not adjust lowmem. */
88 if ((lowmem - ebda_addr) <= 0x10000)
89 lowmem = ebda_addr;
90
91 /* Fixup: bios does not report an EBDA at all. */
92 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
93 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
94 lowmem = 0x9f000;
95
96 /* Paranoia: should never happen, but... */
97 if ((lowmem == 0) || (lowmem >= 0x100000))
98 lowmem = 0x9f000;
99
100 /* reserve all memory between lowmem and the 1MB mark */
101 reserve_early(lowmem, 0x100000, "BIOS reserved");
102}
103
104static void __init reserve_setup_data(void)
105{
106 struct setup_data *data;
107 unsigned long pa_data;
108 char buf[32];
109
110 if (boot_params.hdr.version < 0x0209)
111 return;
112 pa_data = boot_params.hdr.setup_data;
113 while (pa_data) {
114 data = early_ioremap(pa_data, sizeof(*data));
115 sprintf(buf, "setup data %x", data->type);
116 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
117 pa_data = data->next;
118 early_iounmap(data, sizeof(*data));
119 }
120}
121
122void __init x86_64_start_kernel(char * real_mode_data) 75void __init x86_64_start_kernel(char * real_mode_data)
123{ 76{
124 int i; 77 int i;
@@ -135,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
135 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
136 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
137 (__START_KERNEL & PGDIR_MASK))); 90 (__START_KERNEL & PGDIR_MASK)));
91 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
138 92
139 /* clear bss before set_intr_gate with early_idt_handler */ 93 /* clear bss before set_intr_gate with early_idt_handler */
140 clear_bss(); 94 clear_bss();
@@ -154,12 +108,16 @@ void __init x86_64_start_kernel(char * real_mode_data)
154 } 108 }
155 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
156 110
157 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
158 113
159 for (i = 0; i < NR_CPUS; i++) 114 x86_64_init_pda();
160 cpu_pda(i) = &boot_cpu_pda[i];
161 115
162 pda_init(0); 116 x86_64_start_reservations(real_mode_data);
117}
118
119void __init x86_64_start_reservations(char *real_mode_data)
120{
163 copy_bootdata(__va(real_mode_data)); 121 copy_bootdata(__va(real_mode_data));
164 122
165 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
@@ -175,7 +133,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
175#endif 133#endif
176 134
177 reserve_ebda_region(); 135 reserve_ebda_region();
178 reserve_setup_data();
179 136
180 /* 137 /*
181 * At this point everything still needed from the boot loader 138 * At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162c..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -194,10 +190,11 @@ default_entry:
194 xorl %ebx,%ebx /* %ebx is kept at zero */ 190 xorl %ebx,%ebx /* %ebx is kept at zero */
195 191
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
193 movl %edi, pa(init_pg_tables_start)
197 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
198 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
19910: 19610:
200 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
201 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
202 /* Upper half already zero */ 199 /* Upper half already zero */
203 addl $8,%edx 200 addl $8,%edx
@@ -214,24 +211,27 @@ default_entry:
214 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
215 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
216 */ 213 */
217 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
218 cmpl %ebp,%eax 215 cmpl %ebp,%eax
219 jb 10b 216 jb 10b
2201: 2171:
221 movl %edi,pa(init_pg_tables_end) 218 movl %edi,pa(init_pg_tables_end)
219 shrl $12, %eax
220 movl %eax, pa(max_pfn_mapped)
222 221
223 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
224 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
225 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
226#else /* Not PAE */ 225#else /* Not PAE */
227 226
228page_pde_offset = (__PAGE_OFFSET >> 20); 227page_pde_offset = (__PAGE_OFFSET >> 20);
229 228
230 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
230 movl %edi, pa(init_pg_tables_start)
231 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
232 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23310: 23310:
234 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
235 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
237 addl $4,%edx 237 addl $4,%edx
@@ -245,13 +245,15 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
245 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
246 * the attribute bits 246 * the attribute bits
247 */ 247 */
248 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
249 cmpl %ebp,%eax 249 cmpl %ebp,%eax
250 jb 10b 250 jb 10b
251 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
252 shrl $12, %eax
253 movl %eax, pa(max_pfn_mapped)
252 254
253 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
254 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
255 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
256#endif 258#endif
257 jmp 3f 259 jmp 3f
@@ -446,10 +448,10 @@ is386: movl $2,%ecx # set MP
446 je 1f 448 je 1f
447 movl $(__KERNEL_PERCPU), %eax 449 movl $(__KERNEL_PERCPU), %eax
448 movl %eax,%fs # set this cpu's percpu 450 movl %eax,%fs # set this cpu's percpu
449 jmp initialize_secondary # all other CPUs call initialize_secondary 451 movl (stack_start), %esp
4501: 4521:
451#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
452 jmp i386_start_kernel 454 jmp *(initial_code)
453 455
454/* 456/*
455 * We depend on ET to be correct. This checks for 287/387. 457 * We depend on ET to be correct. This checks for 287/387.
@@ -592,6 +594,11 @@ ignore_int:
592#endif 594#endif
593 iret 595 iret
594 596
597.section .cpuinit.data,"wa"
598.align 4
599ENTRY(initial_code)
600 .long i386_start_kernel
601
595.section .text 602.section .text
596/* 603/*
597 * Real beginning of normal "text" segment 604 * Real beginning of normal "text" segment
@@ -623,19 +630,19 @@ ENTRY(empty_zero_page)
623 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
624 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
625ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
626 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
627# if KPMDS == 3 634# if KPMDS == 3
628 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
629 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
630 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
631# elif KPMDS == 2 638# elif KPMDS == 2
632 .long 0,0 639 .long 0,0
633 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
634 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
635# elif KPMDS == 1 642# elif KPMDS == 1
636 .long 0,0 643 .long 0,0
637 .long 0,0 644 .long 0,0
638 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
639# else 646# else
640# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
641# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d1..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
18#include <asm/page.h> 18#include <asm/page.h>
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h>
21 22
22#ifdef CONFIG_PARAVIRT 23#ifdef CONFIG_PARAVIRT
23#include <asm/asm-offsets.h> 24#include <asm/asm-offsets.h>
@@ -31,6 +32,13 @@
31 * 32 *
32 */ 33 */
33 34
35#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
36
37L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
38L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
39L4_START_KERNEL = pgd_index(__START_KERNEL_map)
40L3_START_KERNEL = pud_index(__START_KERNEL_map)
41
34 .text 42 .text
35 .section .text.head 43 .section .text.head
36 .code64 44 .code64
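The new L4_*/L3_* symbols replace the hard-coded page-table slots (258 and 511) used later in this file. As a hedged sketch, the index helpers reduce to the standard 4-level shifts and masks; the concrete __PAGE_OFFSET value shown is the one used by kernels of this vintage and appears here only to reproduce the old constant:

/* Sketch: 4-level paging index arithmetic behind L4_PAGE_OFFSET and friends. */
#define PGDIR_SHIFT	39
#define PUD_SHIFT	30
#define PTRS_PER_PGD	512
#define PTRS_PER_PUD	512

#define pgd_index(a)	(((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define pud_index(a)	(((a) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

/* With __PAGE_OFFSET == 0xffff810000000000UL this gives pgd_index() == 258,
 * and __START_KERNEL_map == 0xffffffff80000000UL gives pgd_index() == 511 --
 * exactly the magic numbers the patch replaces below. */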
@@ -76,8 +84,8 @@ startup_64:
76 /* Fixup the physical addresses in the page table 84 /* Fixup the physical addresses in the page table
77 */ 85 */
78 addq %rbp, init_level4_pgt + 0(%rip) 86 addq %rbp, init_level4_pgt + 0(%rip)
79 addq %rbp, init_level4_pgt + (258*8)(%rip) 87 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
80 addq %rbp, init_level4_pgt + (511*8)(%rip) 88 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
81 89
82 addq %rbp, level3_ident_pgt + 0(%rip) 90 addq %rbp, level3_ident_pgt + 0(%rip)
83 91
@@ -102,7 +110,7 @@ startup_64:
102 movq %rdi, %rax 110 movq %rdi, %rax
103 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
104 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
105 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
106 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
107 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
108ident_complete: 116ident_complete:
@@ -128,7 +136,7 @@ ident_complete:
128 /* Fixup phys_base */ 136 /* Fixup phys_base */
129 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
130 138
131#ifdef CONFIG_SMP 139#ifdef CONFIG_X86_TRAMPOLINE
132 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
133 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
134#endif 142#endif
@@ -154,9 +162,7 @@ ENTRY(secondary_startup_64)
154 */ 162 */
155 163
156 /* Enable PAE mode and PGE */ 164 /* Enable PAE mode and PGE */
157 xorq %rax, %rax 165 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
158 btsq $5, %rax
159 btsq $7, %rax
160 movq %rax, %cr4 166 movq %rax, %cr4
161 167
162 /* Setup early boot stage 4 level pagetables. */ 168 /* Setup early boot stage 4 level pagetables. */
@@ -184,19 +190,15 @@ ENTRY(secondary_startup_64)
1841: wrmsr /* Make changes effective */ 1901: wrmsr /* Make changes effective */
185 191
186 /* Setup cr0 */ 192 /* Setup cr0 */
187#define CR0_PM 1 /* protected mode */ 193#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
188#define CR0_MP (1<<1) 194 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
189#define CR0_ET (1<<4) 195 X86_CR0_PG)
190#define CR0_NE (1<<5) 196 movl $CR0_STATE, %eax
191#define CR0_WP (1<<16)
192#define CR0_AM (1<<18)
193#define CR0_PAGING (1<<31)
194 movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
195 /* Make changes effective */ 197 /* Make changes effective */
196 movq %rax, %cr0 198 movq %rax, %cr0
197 199
198 /* Setup a boot time stack */ 200 /* Setup a boot time stack */
199 movq init_rsp(%rip),%rsp 201 movq stack_start(%rip),%rsp
200 202
201 /* zero EFLAGS after setting rsp */ 203 /* zero EFLAGS after setting rsp */
202 pushq $0 204 pushq $0
@@ -208,7 +210,7 @@ ENTRY(secondary_startup_64)
208 * addresses where we're currently running on. We have to do that here 210 * addresses where we're currently running on. We have to do that here
209 * because in 32bit we couldn't load a 64bit linear address. 211 * because in 32bit we couldn't load a 64bit linear address.
210 */ 212 */
211 lgdt cpu_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
212 214
213 /* set up data segments. actually 0 would do too */ 215 /* set up data segments. actually 0 would do too */
214 movl $__KERNEL_DS,%eax 216 movl $__KERNEL_DS,%eax
@@ -257,8 +259,9 @@ ENTRY(secondary_startup_64)
257 .quad x86_64_start_kernel 259 .quad x86_64_start_kernel
258 __FINITDATA 260 __FINITDATA
259 261
260 ENTRY(init_rsp) 262 ENTRY(stack_start)
261 .quad init_thread_union+THREAD_SIZE-8 263 .quad init_thread_union+THREAD_SIZE-8
264 .word 0
262 265
263bad_address: 266bad_address:
264 jmp bad_address 267 jmp bad_address
@@ -327,11 +330,11 @@ early_idt_ripmsg:
327ENTRY(name) 330ENTRY(name)
328 331
329/* Automate the creation of 1 to 1 mapping pmd entries */ 332/* Automate the creation of 1 to 1 mapping pmd entries */
330#define PMDS(START, PERM, COUNT) \ 333#define PMDS(START, PERM, COUNT) \
331 i = 0 ; \ 334 i = 0 ; \
332 .rept (COUNT) ; \ 335 .rept (COUNT) ; \
333 .quad (START) + (i << 21) + (PERM) ; \ 336 .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
334 i = i + 1 ; \ 337 i = i + 1 ; \
335 .endr 338 .endr
336 339
337 /* 340 /*
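The PMDS() assembler macro emits COUNT consecutive PMD entries, each covering one 2 MB large page; replacing the literal 21 with PMD_SHIFT changes only the spelling, not the value. Roughly equivalent C, as an illustration (the function name and the plain u64 entry type are assumptions, not kernel code):

/* Sketch: what a single PMDS(START, PERM, COUNT) expansion fills in. */
static void fill_pmds(unsigned long long *pmd, unsigned long long start,
		      unsigned long long perm, int count)
{
	int i;

	for (i = 0; i < count; i++)	/* .rept (COUNT) */
		pmd[i] = start + ((unsigned long long)i << 21) + perm;
					/* PMD_SHIFT == 21 for 2 MB pages */
}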
@@ -342,9 +345,9 @@ ENTRY(name)
342 */ 345 */
343NEXT_PAGE(init_level4_pgt) 346NEXT_PAGE(init_level4_pgt)
344 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 347 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
345 .fill 257,8,0 348 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
346 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 349 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
347 .fill 252,8,0 350 .org init_level4_pgt + L4_START_KERNEL*8, 0
348 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 351 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
349 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 352 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
350 353
@@ -353,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt)
353 .fill 511,8,0 356 .fill 511,8,0
354 357
355NEXT_PAGE(level3_kernel_pgt) 358NEXT_PAGE(level3_kernel_pgt)
356 .fill 510,8,0 359 .fill L3_START_KERNEL,8,0
357 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 360 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
358 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 361 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
359 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 362 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -371,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
371 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
372 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
373 */ 376 */
374 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
375 378
376NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
377 /* 380 /*
@@ -384,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt)
384 * If you want to increase this then increase MODULES_VADDR 387 * If you want to increase this then increase MODULES_VADDR
385 * too.) 388 * too.)
386 */ 389 */
387 PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, 390 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
388 KERNEL_IMAGE_SIZE/PMD_SIZE) 391 KERNEL_IMAGE_SIZE/PMD_SIZE)
389 392
390NEXT_PAGE(level2_spare_pgt) 393NEXT_PAGE(level2_spare_pgt)
@@ -395,54 +398,17 @@ NEXT_PAGE(level2_spare_pgt)
395 398
396 .data 399 .data
397 .align 16 400 .align 16
398 .globl cpu_gdt_descr 401 .globl early_gdt_descr
399cpu_gdt_descr: 402early_gdt_descr:
400 .word gdt_end-cpu_gdt_table-1 403 .word GDT_ENTRIES*8-1
401gdt: 404 .quad per_cpu__gdt_page
402 .quad cpu_gdt_table
403#ifdef CONFIG_SMP
404 .rept NR_CPUS-1
405 .word 0
406 .quad 0
407 .endr
408#endif
409 405
410ENTRY(phys_base) 406ENTRY(phys_base)
411 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
412 .quad 0x0000000000000000 408 .quad 0x0000000000000000
413 409
414/* We need valid kernel segments for data and code in long mode too 410#include "../../x86/xen/xen-head.S"
415 * IRET will check the segment types kkeil 2000/10/28
416 * Also sysret mandates a special GDT layout
417 */
418
419 .section .data.page_aligned, "aw"
420 .align PAGE_SIZE
421
422/* The TLS descriptors are currently at a different place compared to i386.
423 Hopefully nobody expects them at a fixed place (Wine?) */
424 411
425ENTRY(cpu_gdt_table)
426 .quad 0x0000000000000000 /* NULL descriptor */
427 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
428 .quad 0x00af9b000000ffff /* __KERNEL_CS */
429 .quad 0x00cf93000000ffff /* __KERNEL_DS */
430 .quad 0x00cffb000000ffff /* __USER32_CS */
431 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
432 .quad 0x00affb000000ffff /* __USER_CS */
433 .quad 0x0 /* unused */
434 .quad 0,0 /* TSS */
435 .quad 0,0 /* LDT */
436 .quad 0,0,0 /* three TLS descriptors */
437 .quad 0x0000f40000000000 /* node/CPU stored in limit */
438gdt_end:
439 /* asm/segment.h:GDT_ENTRIES must match this */
440 /* This should be a multiple of the cache line size */
441 /* GDTs of other CPUs are now dynamically allocated */
442
443 /* zero the remaining page */
444 .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
445
446 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
447 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
448ENTRY(idt_table) 414ENTRY(idt_table)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc426..acf62fc233da 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
17 17
18/* FSEC = 10^-15 18/* FSEC = 10^-15
19 NSEC = 10^-9 */ 19 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000 20#define FSEC_PER_NSEC 1000000L
21 21
22/* 22/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 23 * HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a)
36} 36}
37 37
38#ifdef CONFIG_X86_64 38#ifdef CONFIG_X86_64
39
40#include <asm/pgtable.h> 39#include <asm/pgtable.h>
41 40#endif
42static inline void hpet_set_mapping(void)
43{
44 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
45 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
46 hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
47}
48
49static inline void hpet_clear_mapping(void)
50{
51 hpet_virt_address = NULL;
52}
53
54#else
55 41
56static inline void hpet_set_mapping(void) 42static inline void hpet_set_mapping(void)
57{ 43{
58 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 44 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
45#ifdef CONFIG_X86_64
46 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
47#endif
59} 48}
60 49
61static inline void hpet_clear_mapping(void) 50static inline void hpet_clear_mapping(void)
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void)
63 iounmap(hpet_virt_address); 52 iounmap(hpet_virt_address);
64 hpet_virt_address = NULL; 53 hpet_virt_address = NULL;
65} 54}
66#endif
67 55
68/* 56/*
69 * HPET command line enable / disable 57 * HPET command line enable / disable
@@ -127,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id)
127 hd.hd_phys_address = hpet_address; 115 hd.hd_phys_address = hpet_address;
128 hd.hd_address = hpet; 116 hd.hd_address = hpet;
129 hd.hd_nirqs = nrtimers; 117 hd.hd_nirqs = nrtimers;
130 hd.hd_flags = HPET_DATA_PLATFORM;
131 hpet_reserve_timer(&hd, 0); 118 hpet_reserve_timer(&hd, 0);
132 119
133#ifdef CONFIG_HPET_EMULATE_RTC 120#ifdef CONFIG_HPET_EMULATE_RTC
134 hpet_reserve_timer(&hd, 1); 121 hpet_reserve_timer(&hd, 1);
135#endif 122#endif
136 123
124 /*
125 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
126 * is wrong for i8259!) not the output IRQ. Many BIOS writers
127 * don't bother configuring *any* comparator interrupts.
128 */
137 hd.hd_irq[0] = HPET_LEGACY_8254; 129 hd.hd_irq[0] = HPET_LEGACY_8254;
138 hd.hd_irq[1] = HPET_LEGACY_RTC; 130 hd.hd_irq[1] = HPET_LEGACY_RTC;
139 131
@@ -206,25 +198,24 @@ static void hpet_enable_legacy_int(void)
206 198
207static void hpet_legacy_clockevent_register(void) 199static void hpet_legacy_clockevent_register(void)
208{ 200{
209 uint64_t hpet_freq;
210
211 /* Start HPET legacy interrupts */ 201 /* Start HPET legacy interrupts */
212 hpet_enable_legacy_int(); 202 hpet_enable_legacy_int();
213 203
214 /* 204 /*
215 * The period is a femto seconds value. We need to calculate the 205 * The mult factor is defined as (include/linux/clockchips.h)
216 * scaled math multiplication factor for nanosecond to hpet tick 206 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
217 * conversion. 207 * hpet_period is in units of femtoseconds (per cycle), so
208 * mult/2^shift = cyc/ns = 10^6/hpet_period
209 * mult = (10^6 * 2^shift)/hpet_period
210 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
218 */ 211 */
219 hpet_freq = 1000000000000000ULL; 212 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
220 do_div(hpet_freq, hpet_period); 213 hpet_period, hpet_clockevent.shift);
221 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
222 NSEC_PER_SEC, hpet_clockevent.shift);
223 /* Calculate the min / max delta */ 214 /* Calculate the min / max delta */
224 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 215 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
225 &hpet_clockevent); 216 &hpet_clockevent);
226 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 217 /* 5 usec minimum reprogramming delta. */
227 &hpet_clockevent); 218 hpet_clockevent.min_delta_ns = 5000;
228 219
229 /* 220 /*
230 * Start hpet with the boot cpu mask and make it 221 * Start hpet with the boot cpu mask and make it
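For the clockevent, the rewritten comment derives mult = (10^6 * 2^shift) / hpet_period, i.e. HPET cycles per nanosecond scaled up by 2^shift; the kernel's div_sc() helper performs essentially this scaled division. A standalone sketch of the arithmetic (the helper name is illustrative):

/* Sketch: clockevent mult, so that cycles = (ns * mult) >> shift. */
static unsigned int clockevent_mult(unsigned long hpet_period_fs, int shift)
{
	/* 10^6 femtoseconds per nanosecond / femtoseconds per HPET cycle */
	return (unsigned int)(((unsigned long long)1000000 << shift) / hpet_period_fs);
}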
@@ -283,15 +274,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
283} 274}
284 275
285static int hpet_legacy_next_event(unsigned long delta, 276static int hpet_legacy_next_event(unsigned long delta,
286 struct clock_event_device *evt) 277 struct clock_event_device *evt)
287{ 278{
288 unsigned long cnt; 279 u32 cnt;
289 280
290 cnt = hpet_readl(HPET_COUNTER); 281 cnt = hpet_readl(HPET_COUNTER);
291 cnt += delta; 282 cnt += (u32) delta;
292 hpet_writel(cnt, HPET_T0_CMP); 283 hpet_writel(cnt, HPET_T0_CMP);
293 284
294 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 285 /*
286 * We need to read back the CMP register to make sure that
287 * what we wrote hit the chip before we compare it to the
288 * counter.
289 */
290 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
291
292 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
295} 293}
296 294
297/* 295/*
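The rewritten return statement relies on 32-bit unsigned wraparound: subtracting the programmed comparator from the current counter and interpreting the difference as signed reveals whether the expiry has already passed, even if the counter rolled over in between. A small sketch of that check (names are illustrative):

/* Sketch: has the free-running 32-bit counter already passed the comparator? */
static int hpet_expiry_passed(unsigned int counter, unsigned int cmp)
{
	/* The signed view of the unsigned difference is wraparound-safe. */
	return (int)(counter - cmp) >= 0;
}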
@@ -324,7 +322,7 @@ static struct clocksource clocksource_hpet = {
324 322
325static int hpet_clocksource_register(void) 323static int hpet_clocksource_register(void)
326{ 324{
327 u64 tmp, start, now; 325 u64 start, now;
328 cycle_t t1; 326 cycle_t t1;
329 327
330 /* Start the counter */ 328 /* Start the counter */
@@ -351,21 +349,15 @@ static int hpet_clocksource_register(void)
351 return -ENODEV; 349 return -ENODEV;
352 } 350 }
353 351
354 /* Initialize and register HPET clocksource 352 /*
355 * 353 * The definition of mult is (include/linux/clocksource.h)
356 * hpet period is in femto seconds per cycle 354 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
357 * so we need to convert this to ns/cyc units 355 * so we first need to convert hpet_period to ns/cyc units:
358 * approximated by mult/2^shift 356 * mult/2^shift = ns/cyc = hpet_period/10^6
359 * 357 * mult = (hpet_period * 2^shift)/10^6
360 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift 358 * mult = (hpet_period << shift)/FSEC_PER_NSEC
361 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
362 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
363 * (fsec/cyc << shift)/1000000 = mult
364 * (hpet_period << shift)/FSEC_PER_NSEC = mult
365 */ 359 */
366 tmp = (u64)hpet_period << HPET_SHIFT; 360 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
367 do_div(tmp, FSEC_PER_NSEC);
368 clocksource_hpet.mult = (u32)tmp;
369 361
370 clocksource_register(&clocksource_hpet); 362 clocksource_register(&clocksource_hpet);
371 363
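The clocksource needs the inverse factor: mult/2^shift must equal nanoseconds per cycle, so mult = (hpet_period * 2^shift) / 10^6, which is what both the removed do_div() sequence and the new div_sc() call compute. As a sketch with a worked number, a 14.318 MHz HPET has hpet_period of roughly 69,841,280 fs, i.e. about 69.84 ns per cycle:

/* Sketch: clocksource mult, so that ns = (cycles * mult) >> shift. */
static unsigned int clocksource_mult(unsigned long hpet_period_fs, int shift)
{
	return (unsigned int)(((unsigned long long)hpet_period_fs << shift) / 1000000);
}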
@@ -378,6 +370,7 @@ static int hpet_clocksource_register(void)
378int __init hpet_enable(void) 370int __init hpet_enable(void)
379{ 371{
380 unsigned long id; 372 unsigned long id;
373 int i;
381 374
382 if (!is_hpet_capable()) 375 if (!is_hpet_capable())
383 return 0; 376 return 0;
@@ -388,6 +381,29 @@ int __init hpet_enable(void)
388 * Read the period and check for a sane value: 381 * Read the period and check for a sane value:
389 */ 382 */
390 hpet_period = hpet_readl(HPET_PERIOD); 383 hpet_period = hpet_readl(HPET_PERIOD);
384
385 /*
386 * AMD SB700 based systems with spread spectrum enabled use a
387 * SMM based HPET emulation to provide proper frequency
388 * setting. The SMM code is initialized with the first HPET
389 * register access and takes some time to complete. During
390 * this time the config register reads 0xffffffff. We check
391 * for max. 1000 loops whether the config register reads a non
392 * 0xffffffff value to make sure that HPET is up and running
393 * before we go further. A counting loop is safe, as the HPET
394 * access takes thousands of CPU cycles. On non SB700 based
395 * machines this check is only done once and has no side
396 * effects.
397 */
398 for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
399 if (i == 1000) {
400 printk(KERN_WARNING
401 "HPET config register value = 0xFFFFFFFF. "
402 "Disabling HPET\n");
403 goto out_nohpet;
404 }
405 }
406
391 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) 407 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
392 goto out_nohpet; 408 goto out_nohpet;
393 409
@@ -487,7 +503,7 @@ void hpet_disable(void)
487#define RTC_NUM_INTS 1 503#define RTC_NUM_INTS 1
488 504
489static unsigned long hpet_rtc_flags; 505static unsigned long hpet_rtc_flags;
490static unsigned long hpet_prev_update_sec; 506static int hpet_prev_update_sec;
491static struct rtc_time hpet_alarm_time; 507static struct rtc_time hpet_alarm_time;
492static unsigned long hpet_pie_count; 508static unsigned long hpet_pie_count;
493static unsigned long hpet_t1_cmp; 509static unsigned long hpet_t1_cmp;
@@ -594,6 +610,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
594 610
595 hpet_rtc_flags |= bit_mask; 611 hpet_rtc_flags |= bit_mask;
596 612
613 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
614 hpet_prev_update_sec = -1;
615
597 if (!oldbits) 616 if (!oldbits)
598 hpet_rtc_timer_init(); 617 hpet_rtc_timer_init();
599 618
@@ -671,7 +690,7 @@ static void hpet_rtc_timer_reinit(void)
671 if (hpet_rtc_flags & RTC_PIE) 690 if (hpet_rtc_flags & RTC_PIE)
672 hpet_pie_count += lost_ints; 691 hpet_pie_count += lost_ints;
673 if (printk_ratelimit()) 692 if (printk_ratelimit())
674 printk(KERN_WARNING "rtc: lost %d interrupts\n", 693 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
675 lost_ints); 694 lost_ints);
676 } 695 }
677} 696}
@@ -689,7 +708,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
689 708
690 if (hpet_rtc_flags & RTC_UIE && 709 if (hpet_rtc_flags & RTC_UIE &&
691 curr_time.tm_sec != hpet_prev_update_sec) { 710 curr_time.tm_sec != hpet_prev_update_sec) {
692 rtc_int_flag = RTC_UF; 711 if (hpet_prev_update_sec >= 0)
712 rtc_int_flag = RTC_UF;
693 hpet_prev_update_sec = curr_time.tm_sec; 713 hpet_prev_update_sec = curr_time.tm_sec;
694 } 714 }
695 715
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e923..dd7ebee446af 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
1#include <linux/module.h> 1#include <linux/module.h>
2
2#include <asm/checksum.h> 3#include <asm/checksum.h>
3#include <asm/desc.h>
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/desc.h>
6#include <asm/ftrace.h>
7
8#ifdef CONFIG_FTRACE
9/* mcount is defined in assembly */
10EXPORT_SYMBOL(mcount);
11#endif
5 12
6/* Networking helper routines. */ 13/* Networking helper routines. */
7EXPORT_SYMBOL(csum_partial_copy_generic); 14EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96 * Boot processor to setup the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
476 * For legacy compatible, we always set FP/SSE bits in the bit
477 * vector while saving the state to the user context.
478 * This will enable us capturing any changes(during sigreturn) to
479 * the FP/SSE bits by the legacy applications which don't touch
480 * xstate_bv in the xsave header.
481 *
482 * xsave aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
602
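The bit manipulation on xstate_bv is the heart of restore_i387_xsave(): keep only features the OS enabled (pcntxt_mask), and clear the bits of enabled features that the signal frame did not actually supply, so those are re-initialized rather than restored from stale memory. A condensed sketch of the same masking (the helper is hypothetical; the feature names are only examples):

/* Sketch: effective xstate_bv after a 32-bit sigreturn restore. */
static unsigned long long effective_xstate_bv(unsigned long long saved_bv,
					      unsigned long long user_bv,
					      unsigned long long pcntxt_mask)
{
	saved_bv &= pcntxt_mask;		/* only OS-enabled features      */
	saved_bv &= ~(pcntxt_mask & ~user_bv);	/* drop features the frame lacks */
	return saved_bv;	/* e.g. FP|SSE kept, missing extended state re-inited */
}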
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c
index fe631967d625..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,8 +1,10 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
8#include <linux/init.h> 10#include <linux/init.h>
@@ -10,10 +12,12 @@
10#include <linux/sysdev.h> 12#include <linux/sysdev.h>
11#include <linux/bitops.h> 13#include <linux/bitops.h>
12 14
15#include <asm/acpi.h>
13#include <asm/atomic.h> 16#include <asm/atomic.h>
14#include <asm/system.h> 17#include <asm/system.h>
15#include <asm/io.h> 18#include <asm/io.h>
16#include <asm/timer.h> 19#include <asm/timer.h>
20#include <asm/hw_irq.h>
17#include <asm/pgtable.h> 21#include <asm/pgtable.h>
18#include <asm/delay.h> 22#include <asm/delay.h>
19#include <asm/desc.h> 23#include <asm/desc.h>
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi;
32DEFINE_SPINLOCK(i8259A_lock); 36DEFINE_SPINLOCK(i8259A_lock);
33static void mask_and_ack_8259A(unsigned int); 37static void mask_and_ack_8259A(unsigned int);
34 38
35static struct irq_chip i8259A_chip = { 39struct irq_chip i8259A_chip = {
36 .name = "XT-PIC", 40 .name = "XT-PIC",
37 .mask = disable_8259A_irq, 41 .mask = disable_8259A_irq,
38 .disable = disable_8259A_irq, 42 .disable = disable_8259A_irq,
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
125 int irqmask = 1<<irq; 129 int irqmask = 1<<irq;
126 130
127 if (irq < 8) { 131 if (irq < 8) {
128 outb(0x0B,PIC_MASTER_CMD); /* ISR register */ 132 outb(0x0B, PIC_MASTER_CMD); /* ISR register */
129 value = inb(PIC_MASTER_CMD) & irqmask; 133 value = inb(PIC_MASTER_CMD) & irqmask;
130 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ 134 outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */
131 return value; 135 return value;
132 } 136 }
133 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ 137 outb(0x0B, PIC_SLAVE_CMD); /* ISR register */
134 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); 138 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
135 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ 139 outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */
136 return value; 140 return value;
137} 141}
138 142
@@ -171,12 +175,14 @@ handle_real_irq:
171 if (irq & 8) { 175 if (irq & 8) {
172 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ 176 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
173 outb(cached_slave_mask, PIC_SLAVE_IMR); 177 outb(cached_slave_mask, PIC_SLAVE_IMR);
174 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ 178 /* 'Specific EOI' to slave */
175 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ 179 outb(0x60+(irq&7), PIC_SLAVE_CMD);
180 /* 'Specific EOI' to master-IRQ2 */
181 outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
176 } else { 182 } else {
177 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ 183 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
178 outb(cached_master_mask, PIC_MASTER_IMR); 184 outb(cached_master_mask, PIC_MASTER_IMR);
179 outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ 185 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
180 } 186 }
181 spin_unlock_irqrestore(&i8259A_lock, flags); 187 spin_unlock_irqrestore(&i8259A_lock, flags);
182 return; 188 return;
@@ -199,7 +205,8 @@ spurious_8259A_irq:
199 * lets ACK and report it. [once per IRQ] 205 * lets ACK and report it. [once per IRQ]
200 */ 206 */
201 if (!(spurious_irq_mask & irqmask)) { 207 if (!(spurious_irq_mask & irqmask)) {
202 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); 208 printk(KERN_DEBUG
209 "spurious 8259A interrupt: IRQ%d.\n", irq);
203 spurious_irq_mask |= irqmask; 210 spurious_irq_mask |= irqmask;
204 } 211 }
205 atomic_inc(&irq_err_count); 212 atomic_inc(&irq_err_count);
@@ -275,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
275 282
276device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
277 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
278void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
279{ 310{
280 unsigned long flags; 311 unsigned long flags;
@@ -290,17 +321,28 @@ void init_8259A(int auto_eoi)
290 * outb_pic - this has to work on a wide range of PC hardware. 321 * outb_pic - this has to work on a wide range of PC hardware.
291 */ 322 */
292 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 323 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
293 outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ 324
294 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ 325 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
326 to 0x20-0x27 on i386 */
327 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
328
329 /* 8259A-1 (the master) has a slave on IR2 */
330 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
331
295 if (auto_eoi) /* master does Auto EOI */ 332 if (auto_eoi) /* master does Auto EOI */
296 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); 333 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
297 else /* master expects normal EOI */ 334 else /* master expects normal EOI */
298 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); 335 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
299 336
300 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ 337 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
301 outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ 338
302 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ 339 /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
303 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ 340 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
341 /* 8259A-2 is a slave on master's IR2 */
342 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
343 /* (slave's support for AEOI in flat mode is to be investigated) */
344 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
345
304 if (auto_eoi) 346 if (auto_eoi)
305 /* 347 /*
306 * In AEOI mode we just have to mask the interrupt 348 * In AEOI mode we just have to mask the interrupt
@@ -317,93 +359,3 @@ void init_8259A(int auto_eoi)
317 359
318 spin_unlock_irqrestore(&i8259A_lock, flags); 360 spin_unlock_irqrestore(&i8259A_lock, flags);
319} 361}
320
321/*
322 * Note that on a 486, we don't want to do a SIGFPE on an irq13
323 * as the irq is unreliable, and exception 16 works correctly
324 * (ie as explained in the intel literature). On a 386, you
325 * can't use exception 16 due to bad IBM design, so we have to
326 * rely on the less exact irq13.
327 *
328 * Careful.. Not only is IRQ13 unreliable, but it is also
329 * leads to races. IBM designers who came up with it should
330 * be shot.
331 */
332
333
334static irqreturn_t math_error_irq(int cpl, void *dev_id)
335{
336 extern void math_error(void __user *);
337 outb(0,0xF0);
338 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
339 return IRQ_NONE;
340 math_error((void __user *)get_irq_regs()->ip);
341 return IRQ_HANDLED;
342}
343
344/*
345 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
346 * so allow interrupt sharing.
347 */
348static struct irqaction fpu_irq = {
349 .handler = math_error_irq,
350 .mask = CPU_MASK_NONE,
351 .name = "fpu",
352};
353
354void __init init_ISA_irqs (void)
355{
356 int i;
357
358#ifdef CONFIG_X86_LOCAL_APIC
359 init_bsp_APIC();
360#endif
361 init_8259A(0);
362
363 /*
364 * 16 old-style INTA-cycle interrupts:
365 */
366 for (i = 0; i < 16; i++) {
367 set_irq_chip_and_handler_name(i, &i8259A_chip,
368 handle_level_irq, "XT");
369 }
370}
371
372/* Overridden in paravirt.c */
373void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
374
375void __init native_init_IRQ(void)
376{
377 int i;
378
379 /* all the set up before the call gates are initialised */
380 pre_intr_init_hook();
381
382 /*
383 * Cover the whole vector space, no vector can escape
384 * us. (some of these will be overridden and become
385 * 'special' SMP interrupts)
386 */
387 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
388 int vector = FIRST_EXTERNAL_VECTOR + i;
389 if (i >= NR_IRQS)
390 break;
391 /* SYSCALL_VECTOR was reserved in trap_init. */
392 if (!test_bit(vector, used_vectors))
393 set_intr_gate(vector, interrupt[i]);
394 }
395
396 /* setup after call gates are initialised (usually add in
397 * the architecture specific gates)
398 */
399 intr_init_hook();
400
401 /*
402 * External FPU? Set up irq13 if so, for
403 * original braindamaged IBM FERR coupling.
404 */
405 if (boot_cpu_data.hard_math && !cpu_has_fpu)
406 setup_irq(FPU_IRQ, &fpu_irq);
407
408 irq_ctx_init(smp_processor_id());
409}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index fa57a1568508..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,512 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define BI(x,y) \
38 BUILD_IRQ(x##y)
39
40#define BUILD_16_IRQS(x) \
41 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
42 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
43 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
44 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
45
46/*
47 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
48 * (these are usually mapped to vectors 0x30-0x3f)
49 */
50
51/*
52 * The IO-APIC gives us many more interrupt sources. Most of these
53 * are unused but an SMP system is supposed to have enough memory ...
54 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
55 * across the spectrum, so we really want to be prepared to get all
56 * of these. Plus, more powerful systems might have more than 64
57 * IO-APIC registers.
58 *
59 * (these are usually mapped into the 0x30-0xff vector range)
60 */
61 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
62BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
63BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
64BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
65
66#undef BUILD_16_IRQS
67#undef BI
68
69
70#define IRQ(x,y) \
71 IRQ##x##y##_interrupt
72
73#define IRQLIST_16(x) \
74 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
75 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
76 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
78
79/* for the irq vectors */
80static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
81 IRQLIST_16(0x2), IRQLIST_16(0x3),
82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
84 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
85};
86
87#undef IRQ
88#undef IRQLIST_16
89
90/*
91 * This is the 'legacy' 8259A Programmable Interrupt Controller,
92 * present in the majority of PC/AT boxes.
93 * plus some generic x86 specific things if generic specifics makes
94 * any sense at all.
95 * this file should become arch/i386/kernel/irq.c when the old irq.c
96 * moves to arch independent land
97 */
98
99static int i8259A_auto_eoi;
100DEFINE_SPINLOCK(i8259A_lock);
101static void mask_and_ack_8259A(unsigned int);
102
103static struct irq_chip i8259A_chip = {
104 .name = "XT-PIC",
105 .mask = disable_8259A_irq,
106 .disable = disable_8259A_irq,
107 .unmask = enable_8259A_irq,
108 .mask_ack = mask_and_ack_8259A,
109};
110
111/*
112 * 8259A PIC functions to handle ISA devices:
113 */
114
115/*
116 * This contains the irq mask for both 8259A irq controllers,
117 */
118unsigned int cached_irq_mask = 0xffff;
119
120/*
121 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
122 * boards the timer interrupt is not really connected to any IO-APIC pin,
123 * it's fed to the master 8259A's IR0 line only.
124 *
125 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
126 * this 'mixed mode' IRQ handling costs nothing because it's only used
127 * at IRQ setup time.
128 */
129unsigned long io_apic_irqs;
130
131void disable_8259A_irq(unsigned int irq)
132{
133 unsigned int mask = 1 << irq;
134 unsigned long flags;
135
136 spin_lock_irqsave(&i8259A_lock, flags);
137 cached_irq_mask |= mask;
138 if (irq & 8)
139 outb(cached_slave_mask, PIC_SLAVE_IMR);
140 else
141 outb(cached_master_mask, PIC_MASTER_IMR);
142 spin_unlock_irqrestore(&i8259A_lock, flags);
143}
144
145void enable_8259A_irq(unsigned int irq)
146{
147 unsigned int mask = ~(1 << irq);
148 unsigned long flags;
149
150 spin_lock_irqsave(&i8259A_lock, flags);
151 cached_irq_mask &= mask;
152 if (irq & 8)
153 outb(cached_slave_mask, PIC_SLAVE_IMR);
154 else
155 outb(cached_master_mask, PIC_MASTER_IMR);
156 spin_unlock_irqrestore(&i8259A_lock, flags);
157}
158
159int i8259A_irq_pending(unsigned int irq)
160{
161 unsigned int mask = 1<<irq;
162 unsigned long flags;
163 int ret;
164
165 spin_lock_irqsave(&i8259A_lock, flags);
166 if (irq < 8)
167 ret = inb(PIC_MASTER_CMD) & mask;
168 else
169 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
170 spin_unlock_irqrestore(&i8259A_lock, flags);
171
172 return ret;
173}
174
175void make_8259A_irq(unsigned int irq)
176{
177 disable_irq_nosync(irq);
178 io_apic_irqs &= ~(1<<irq);
179 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
180 "XT");
181 enable_irq(irq);
182}
183
184/*
185 * This function assumes to be called rarely. Switching between
186 * 8259A registers is slow.
187 * This has to be protected by the irq controller spinlock
188 * before being called.
189 */
190static inline int i8259A_irq_real(unsigned int irq)
191{
192 int value;
193 int irqmask = 1<<irq;
194
195 if (irq < 8) {
196 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
197 value = inb(PIC_MASTER_CMD) & irqmask;
198 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
199 return value;
200 }
201 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
202 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
203 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
204 return value;
205}
206
207/*
208 * Careful! The 8259A is a fragile beast, it pretty
209 * much _has_ to be done exactly like this (mask it
210 * first, _then_ send the EOI, and the order of EOI
211 * to the two 8259s is important!
212 */
213static void mask_and_ack_8259A(unsigned int irq)
214{
215 unsigned int irqmask = 1 << irq;
216 unsigned long flags;
217
218 spin_lock_irqsave(&i8259A_lock, flags);
219 /*
220 * Lightweight spurious IRQ detection. We do not want
221 * to overdo spurious IRQ handling - it's usually a sign
222 * of hardware problems, so we only do the checks we can
223 * do without slowing down good hardware unnecessarily.
224 *
225 * Note that IRQ7 and IRQ15 (the two spurious IRQs
226 * usually resulting from the 8259A-1|2 PICs) occur
227 * even if the IRQ is masked in the 8259A. Thus we
228 * can check spurious 8259A IRQs without doing the
229 * quite slow i8259A_irq_real() call for every IRQ.
230 * This does not cover 100% of spurious interrupts,
231 * but should be enough to warn the user that there
232 * is something bad going on ...
233 */
234 if (cached_irq_mask & irqmask)
235 goto spurious_8259A_irq;
236 cached_irq_mask |= irqmask;
237
238handle_real_irq:
239 if (irq & 8) {
240 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
241 outb(cached_slave_mask, PIC_SLAVE_IMR);
242 /* 'Specific EOI' to slave */
243 outb(0x60+(irq&7),PIC_SLAVE_CMD);
244 /* 'Specific EOI' to master-IRQ2 */
245 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
246 } else {
247 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
248 outb(cached_master_mask, PIC_MASTER_IMR);
249 /* 'Specific EOI' to master */
250 outb(0x60+irq,PIC_MASTER_CMD);
251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return;
254
255spurious_8259A_irq:
256 /*
257 * this is the slow path - should happen rarely.
258 */
259 if (i8259A_irq_real(irq))
260 /*
261 * oops, the IRQ _is_ in service according to the
262 * 8259A - not spurious, go handle it.
263 */
264 goto handle_real_irq;
265
266 {
267 static int spurious_irq_mask;
268 /*
269 * At this point we can be sure the IRQ is spurious,
270 * lets ACK and report it. [once per IRQ]
271 */
272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG
274 "spurious 8259A interrupt: IRQ%d.\n", irq);
275 spurious_irq_mask |= irqmask;
276 }
277 atomic_inc(&irq_err_count);
278 /*
279 * Theoretically we do not have to handle this IRQ,
280 * but in Linux this does not cause problems and is
281 * simpler for us.
282 */
283 goto handle_real_irq;
284 }
285}
286
287static char irq_trigger[2];
288/**
289 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
290 */
291static void restore_ELCR(char *trigger)
292{
293 outb(trigger[0], 0x4d0);
294 outb(trigger[1], 0x4d1);
295}
296
297static void save_ELCR(char *trigger)
298{
299 /* IRQ 0,1,2,8,13 are marked as reserved */
300 trigger[0] = inb(0x4d0) & 0xF8;
301 trigger[1] = inb(0x4d1) & 0xDE;
302}
303
304static int i8259A_resume(struct sys_device *dev)
305{
306 init_8259A(i8259A_auto_eoi);
307 restore_ELCR(irq_trigger);
308 return 0;
309}
310
311static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
312{
313 save_ELCR(irq_trigger);
314 return 0;
315}
316
317static int i8259A_shutdown(struct sys_device *dev)
318{
319 /* Put the i8259A into a quiescent state that
320 * the kernel initialization code can get it
321 * out of.
322 */
323 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
324 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
325 return 0;
326}
327
328static struct sysdev_class i8259_sysdev_class = {
329 .name = "i8259",
330 .suspend = i8259A_suspend,
331 .resume = i8259A_resume,
332 .shutdown = i8259A_shutdown,
333};
334
335static struct sys_device device_i8259A = {
336 .id = 0,
337 .cls = &i8259_sysdev_class,
338};
339
340static int __init i8259A_init_sysfs(void)
341{
342 int error = sysdev_class_register(&i8259_sysdev_class);
343 if (!error)
344 error = sysdev_register(&device_i8259A);
345 return error;
346}
347
348device_initcall(i8259A_init_sysfs);
349
350void init_8259A(int auto_eoi)
351{
352 unsigned long flags;
353
354 i8259A_auto_eoi = auto_eoi;
355
356 spin_lock_irqsave(&i8259A_lock, flags);
357
358 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
359 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
360
361 /*
362 * outb_pic - this has to work on a wide range of PC hardware.
363 */
364 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
365 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
366 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
367 /* 8259A-1 (the master) has a slave on IR2 */
368 outb_pic(0x04, PIC_MASTER_IMR);
369 if (auto_eoi) /* master does Auto EOI */
370 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
371 else /* master expects normal EOI */
372 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
373
374 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
375 /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
376 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
377 /* 8259A-2 is a slave on master's IR2 */
378 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
379 /* (slave's support for AEOI in flat mode is to be investigated) */
380 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
381
382 if (auto_eoi)
383 /*
384 * In AEOI mode we just have to mask the interrupt
385 * when acking.
386 */
387 i8259A_chip.mask_ack = disable_8259A_irq;
388 else
389 i8259A_chip.mask_ack = mask_and_ack_8259A;
390
391 udelay(100); /* wait for 8259A to initialize */
392
393 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
394 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
395
396 spin_unlock_irqrestore(&i8259A_lock, flags);
397}
398
399
400
401
402/*
403 * IRQ2 is cascade interrupt to second interrupt controller
404 */
405
406static struct irqaction irq2 = {
407 .handler = no_action,
408 .mask = CPU_MASK_NONE,
409 .name = "cascade",
410};
411DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
412 [0 ... IRQ0_VECTOR - 1] = -1,
413 [IRQ0_VECTOR] = 0,
414 [IRQ1_VECTOR] = 1,
415 [IRQ2_VECTOR] = 2,
416 [IRQ3_VECTOR] = 3,
417 [IRQ4_VECTOR] = 4,
418 [IRQ5_VECTOR] = 5,
419 [IRQ6_VECTOR] = 6,
420 [IRQ7_VECTOR] = 7,
421 [IRQ8_VECTOR] = 8,
422 [IRQ9_VECTOR] = 9,
423 [IRQ10_VECTOR] = 10,
424 [IRQ11_VECTOR] = 11,
425 [IRQ12_VECTOR] = 12,
426 [IRQ13_VECTOR] = 13,
427 [IRQ14_VECTOR] = 14,
428 [IRQ15_VECTOR] = 15,
429 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
430};
431
432void __init init_ISA_irqs (void)
433{
434 int i;
435
436 init_bsp_APIC();
437 init_8259A(0);
438
439 for (i = 0; i < NR_IRQS; i++) {
440 irq_desc[i].status = IRQ_DISABLED;
441 irq_desc[i].action = NULL;
442 irq_desc[i].depth = 1;
443
444 if (i < 16) {
445 /*
446 * 16 old-style INTA-cycle interrupts:
447 */
448 set_irq_chip_and_handler_name(i, &i8259A_chip,
449 handle_level_irq, "XT");
450 } else {
451 /*
452 * 'high' PCI IRQs filled in on demand
453 */
454 irq_desc[i].chip = &no_irq_chip;
455 }
456 }
457}
458
459void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
460
461void __init native_init_IRQ(void)
462{
463 int i;
464
465 init_ISA_irqs();
466 /*
467 * Cover the whole vector space, no vector can escape
468 * us. (some of these will be overridden and become
469 * 'special' SMP interrupts)
470 */
471 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
472 int vector = FIRST_EXTERNAL_VECTOR + i;
473 if (vector != IA32_SYSCALL_VECTOR)
474 set_intr_gate(vector, interrupt[i]);
475 }
476
477#ifdef CONFIG_SMP
478 /*
479 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
480 * IPI, driven by wakeup.
481 */
482 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
483
484 /* IPIs for invalidation */
485 set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
486 set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
487 set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
488 set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
489 set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
490 set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
491 set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
492 set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
493
494 /* IPI for generic function call */
495 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
496
497 /* Low priority IPI to cleanup after moving an irq */
498 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
499#endif
500 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
501 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
502
503 /* self generated IPI for local APIC timer */
504 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
505
506 /* IPI vectors for APIC spurious and error interrupts */
507 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
508 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
509
510 if (!acpi_ioapic)
511 setup_irq(2, &irq2);
512}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..e710289f673e 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -25,6 +25,7 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/bootmem.h>
28#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
29#include <linux/compiler.h> 30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
@@ -45,10 +46,13 @@
45#include <asm/nmi.h> 46#include <asm/nmi.h>
46#include <asm/msidef.h> 47#include <asm/msidef.h>
47#include <asm/hypertransport.h> 48#include <asm/hypertransport.h>
49#include <asm/setup.h>
48 50
49#include <mach_apic.h> 51#include <mach_apic.h>
50#include <mach_apicdef.h> 52#include <mach_apicdef.h>
51 53
54#define __apicdebuginit(type) static type __init
55
52int (*ioapic_renumber_irq)(int ioapic, int irq); 56int (*ioapic_renumber_irq)(int ioapic, int irq);
53atomic_t irq_mis_count; 57atomic_t irq_mis_count;
54 58
@@ -56,9 +60,9 @@ atomic_t irq_mis_count;
56static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 60static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
57 61
58static DEFINE_SPINLOCK(ioapic_lock); 62static DEFINE_SPINLOCK(ioapic_lock);
59static DEFINE_SPINLOCK(vector_lock); 63DEFINE_SPINLOCK(vector_lock);
60 64
61int timer_over_8254 __initdata = 1; 65int timer_through_8259 __initdata;
62 66
63/* 67/*
64 * Is the SiS APIC rmw bug present ? 68 * Is the SiS APIC rmw bug present ?
@@ -72,15 +76,21 @@ int sis_apic_bug = -1;
72int nr_ioapic_registers[MAX_IO_APICS]; 76int nr_ioapic_registers[MAX_IO_APICS];
73 77
74/* I/O APIC entries */ 78/* I/O APIC entries */
75struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 79struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
76int nr_ioapics; 80int nr_ioapics;
77 81
78/* MP IRQ source entries */ 82/* MP IRQ source entries */
79struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 83struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
80 84
81/* # of MP IRQ source entries */ 85/* # of MP IRQ source entries */
82int mp_irq_entries; 86int mp_irq_entries;
83 87
88#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
89int mp_bus_id_to_type[MAX_MP_BUSSES];
90#endif
91
92DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
93
84static int disable_timer_pin_1 __initdata; 94static int disable_timer_pin_1 __initdata;
85 95
86/* 96/*
@@ -110,7 +120,7 @@ struct io_apic {
110static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 120static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
111{ 121{
112 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 122 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
113 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 123 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
114} 124}
115 125
116static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 126static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -239,7 +249,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
239 } 249 }
240} 250}
241 251
242static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) 252static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
243{ 253{
244 struct irq_pin_list *entry = irq_2_pin + irq; 254 struct irq_pin_list *entry = irq_2_pin + irq;
245 unsigned int pin, reg; 255 unsigned int pin, reg;
@@ -259,30 +269,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
259} 269}
260 270
261/* mask = 1 */ 271/* mask = 1 */
262static void __mask_IO_APIC_irq (unsigned int irq) 272static void __mask_IO_APIC_irq(unsigned int irq)
263{ 273{
264 __modify_IO_APIC_irq(irq, 0x00010000, 0); 274 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
265} 275}
266 276
267/* mask = 0 */ 277/* mask = 0 */
268static void __unmask_IO_APIC_irq (unsigned int irq) 278static void __unmask_IO_APIC_irq(unsigned int irq)
269{ 279{
270 __modify_IO_APIC_irq(irq, 0, 0x00010000); 280 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
271} 281}
272 282
273/* mask = 1, trigger = 0 */ 283/* mask = 1, trigger = 0 */
274static void __mask_and_edge_IO_APIC_irq (unsigned int irq) 284static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
275{ 285{
276 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); 286 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
287 IO_APIC_REDIR_LEVEL_TRIGGER);
277} 288}
278 289
279/* mask = 0, trigger = 1 */ 290/* mask = 0, trigger = 1 */
280static void __unmask_and_level_IO_APIC_irq (unsigned int irq) 291static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
281{ 292{
282 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); 293 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
294 IO_APIC_REDIR_MASKED);
283} 295}
284 296
285static void mask_IO_APIC_irq (unsigned int irq) 297static void mask_IO_APIC_irq(unsigned int irq)
286{ 298{
287 unsigned long flags; 299 unsigned long flags;
288 300
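
The hunk above swaps the bare 0x00010000/0x00008000 constants for the named redirection-entry bits IO_APIC_REDIR_MASKED and IO_APIC_REDIR_LEVEL_TRIGGER. Below is a small user-space sketch of the read-modify-write that __modify_IO_APIC_irq performs on the low dword of a redirection entry; the bit values match the constants being replaced, but the register access is only simulated with a plain variable, so treat it as an illustration rather than the kernel's I/O path.

#include <stdio.h>
#include <stdint.h>

/* same values as the magic numbers replaced above */
#define IO_APIC_REDIR_MASKED         0x00010000
#define IO_APIC_REDIR_LEVEL_TRIGGER  0x00008000

/* stand-in for the low dword of one redirection table entry */
static uint32_t redir_lo = 0x0000a030;   /* made-up initial contents */

static void modify_redir(uint32_t enable, uint32_t disable)
{
	uint32_t reg = redir_lo;     /* io_apic_read() in the real code   */

	reg &= ~disable;             /* clear the bits to be disabled     */
	reg |= enable;               /* set the bits to be enabled        */
	redir_lo = reg;              /* io_apic_modify() in the real code */
}

int main(void)
{
	modify_redir(IO_APIC_REDIR_MASKED, 0);    /* __mask_IO_APIC_irq   */
	printf("masked:   %08x\n", redir_lo);
	modify_redir(0, IO_APIC_REDIR_MASKED);    /* __unmask_IO_APIC_irq */
	printf("unmasked: %08x\n", redir_lo);
	return 0;
}
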
@@ -291,7 +303,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
291 spin_unlock_irqrestore(&ioapic_lock, flags); 303 spin_unlock_irqrestore(&ioapic_lock, flags);
292} 304}
293 305
294static void unmask_IO_APIC_irq (unsigned int irq) 306static void unmask_IO_APIC_irq(unsigned int irq)
295{ 307{
296 unsigned long flags; 308 unsigned long flags;
297 309
@@ -303,7 +315,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
303static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 315static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
304{ 316{
305 struct IO_APIC_route_entry entry; 317 struct IO_APIC_route_entry entry;
306 318
307 /* Check delivery_mode to be sure we're not clearing an SMI pin */ 319 /* Check delivery_mode to be sure we're not clearing an SMI pin */
308 entry = ioapic_read_entry(apic, pin); 320 entry = ioapic_read_entry(apic, pin);
309 if (entry.delivery_mode == dest_SMI) 321 if (entry.delivery_mode == dest_SMI)
@@ -315,7 +327,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
315 ioapic_mask_entry(apic, pin); 327 ioapic_mask_entry(apic, pin);
316} 328}
317 329
318static void clear_IO_APIC (void) 330static void clear_IO_APIC(void)
319{ 331{
320 int apic, pin; 332 int apic, pin;
321 333
@@ -332,7 +344,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
332 struct irq_pin_list *entry = irq_2_pin + irq; 344 struct irq_pin_list *entry = irq_2_pin + irq;
333 unsigned int apicid_value; 345 unsigned int apicid_value;
334 cpumask_t tmp; 346 cpumask_t tmp;
335 347
336 cpus_and(tmp, cpumask, cpu_online_map); 348 cpus_and(tmp, cpumask, cpu_online_map);
337 if (cpus_empty(tmp)) 349 if (cpus_empty(tmp))
338 tmp = TARGET_CPUS; 350 tmp = TARGET_CPUS;
@@ -361,7 +373,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
361# include <linux/kernel_stat.h> /* kstat */ 373# include <linux/kernel_stat.h> /* kstat */
362# include <linux/slab.h> /* kmalloc() */ 374# include <linux/slab.h> /* kmalloc() */
363# include <linux/timer.h> 375# include <linux/timer.h>
364 376
365#define IRQBALANCE_CHECK_ARCH -999 377#define IRQBALANCE_CHECK_ARCH -999
366#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) 378#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
367#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) 379#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -373,14 +385,14 @@ static int physical_balance __read_mostly;
373static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; 385static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
374 386
375static struct irq_cpu_info { 387static struct irq_cpu_info {
376 unsigned long * last_irq; 388 unsigned long *last_irq;
377 unsigned long * irq_delta; 389 unsigned long *irq_delta;
378 unsigned long irq; 390 unsigned long irq;
379} irq_cpu_data[NR_CPUS]; 391} irq_cpu_data[NR_CPUS];
380 392
381#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) 393#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
382#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) 394#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
383#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) 395#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
384 396
385#define IDLE_ENOUGH(cpu,now) \ 397#define IDLE_ENOUGH(cpu,now) \
386 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) 398 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -419,8 +431,8 @@ inside:
419 if (cpu == -1) 431 if (cpu == -1)
420 cpu = NR_CPUS-1; 432 cpu = NR_CPUS-1;
421 } 433 }
422 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || 434 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
423 (search_idle && !IDLE_ENOUGH(cpu,now))); 435 (search_idle && !IDLE_ENOUGH(cpu, now)));
424 436
425 return cpu; 437 return cpu;
426} 438}
@@ -430,15 +442,14 @@ static inline void balance_irq(int cpu, int irq)
430 unsigned long now = jiffies; 442 unsigned long now = jiffies;
431 cpumask_t allowed_mask; 443 cpumask_t allowed_mask;
432 unsigned int new_cpu; 444 unsigned int new_cpu;
433 445
434 if (irqbalance_disabled) 446 if (irqbalance_disabled)
435 return; 447 return;
436 448
437 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); 449 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
438 new_cpu = move(cpu, allowed_mask, now, 1); 450 new_cpu = move(cpu, allowed_mask, now, 1);
439 if (cpu != new_cpu) { 451 if (cpu != new_cpu)
440 set_pending_irq(irq, cpumask_of_cpu(new_cpu)); 452 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
441 }
442} 453}
443 454
444static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) 455static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -450,14 +461,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
450 if (!irq_desc[j].action) 461 if (!irq_desc[j].action)
451 continue; 462 continue;
452 /* Is it a significant load ? */ 463 /* Is it a significant load ? */
453 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < 464 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
454 useful_load_threshold) 465 useful_load_threshold)
455 continue; 466 continue;
456 balance_irq(i, j); 467 balance_irq(i, j);
457 } 468 }
458 } 469 }
459 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, 470 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
460 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); 471 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
461 return; 472 return;
462} 473}
463 474
@@ -486,22 +497,22 @@ static void do_irq_balance(void)
486 /* Is this an active IRQ or balancing disabled ? */ 497 /* Is this an active IRQ or balancing disabled ? */
487 if (!irq_desc[j].action || irq_balancing_disabled(j)) 498 if (!irq_desc[j].action || irq_balancing_disabled(j))
488 continue; 499 continue;
489 if ( package_index == i ) 500 if (package_index == i)
490 IRQ_DELTA(package_index,j) = 0; 501 IRQ_DELTA(package_index, j) = 0;
491 /* Determine the total count per processor per IRQ */ 502 /* Determine the total count per processor per IRQ */
492 value_now = (unsigned long) kstat_cpu(i).irqs[j]; 503 value_now = (unsigned long) kstat_cpu(i).irqs[j];
493 504
494 /* Determine the activity per processor per IRQ */ 505 /* Determine the activity per processor per IRQ */
495 delta = value_now - LAST_CPU_IRQ(i,j); 506 delta = value_now - LAST_CPU_IRQ(i, j);
496 507
497 /* Update last_cpu_irq[][] for the next time */ 508 /* Update last_cpu_irq[][] for the next time */
498 LAST_CPU_IRQ(i,j) = value_now; 509 LAST_CPU_IRQ(i, j) = value_now;
499 510
500 /* Ignore IRQs whose rate is less than the clock */ 511 /* Ignore IRQs whose rate is less than the clock */
501 if (delta < useful_load_threshold) 512 if (delta < useful_load_threshold)
502 continue; 513 continue;
503 /* update the load for the processor or package total */ 514 /* update the load for the processor or package total */
504 IRQ_DELTA(package_index,j) += delta; 515 IRQ_DELTA(package_index, j) += delta;
505 516
506 /* Keep track of the higher numbered sibling as well */ 517 /* Keep track of the higher numbered sibling as well */
507 if (i != package_index) 518 if (i != package_index)
@@ -527,7 +538,8 @@ static void do_irq_balance(void)
527 max_cpu_irq = ULONG_MAX; 538 max_cpu_irq = ULONG_MAX;
528 539
529tryanothercpu: 540tryanothercpu:
530 /* Look for heaviest loaded processor. 541 /*
542 * Look for heaviest loaded processor.
531 * We may come back to get the next heaviest loaded processor. 543 * We may come back to get the next heaviest loaded processor.
532 * Skip processors with trivial loads. 544 * Skip processors with trivial loads.
533 */ 545 */
@@ -536,7 +548,7 @@ tryanothercpu:
536 for_each_online_cpu(i) { 548 for_each_online_cpu(i) {
537 if (i != CPU_TO_PACKAGEINDEX(i)) 549 if (i != CPU_TO_PACKAGEINDEX(i))
538 continue; 550 continue;
539 if (max_cpu_irq <= CPU_IRQ(i)) 551 if (max_cpu_irq <= CPU_IRQ(i))
540 continue; 552 continue;
541 if (tmp_cpu_irq < CPU_IRQ(i)) { 553 if (tmp_cpu_irq < CPU_IRQ(i)) {
542 tmp_cpu_irq = CPU_IRQ(i); 554 tmp_cpu_irq = CPU_IRQ(i);
@@ -545,8 +557,9 @@ tryanothercpu:
545 } 557 }
546 558
547 if (tmp_loaded == -1) { 559 if (tmp_loaded == -1) {
548 /* In the case of small number of heavy interrupt sources, 560 /*
549 * loading some of the cpus too much. We use Ingo's original 561 * In the case of small number of heavy interrupt sources,
562 * loading some of the cpus too much. We use Ingo's original
550 * approach to rotate them around. 563 * approach to rotate them around.
551 */ 564 */
552 if (!first_attempt && imbalance >= useful_load_threshold) { 565 if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -555,13 +568,14 @@ tryanothercpu:
555 } 568 }
556 goto not_worth_the_effort; 569 goto not_worth_the_effort;
557 } 570 }
558 571
559 first_attempt = 0; /* heaviest search */ 572 first_attempt = 0; /* heaviest search */
560 max_cpu_irq = tmp_cpu_irq; /* load */ 573 max_cpu_irq = tmp_cpu_irq; /* load */
561 max_loaded = tmp_loaded; /* processor */ 574 max_loaded = tmp_loaded; /* processor */
562 imbalance = (max_cpu_irq - min_cpu_irq) / 2; 575 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
563 576
564 /* if imbalance is less than approx 10% of max load, then 577 /*
578 * if imbalance is less than approx 10% of max load, then
565 * observe diminishing returns action. - quit 579 * observe diminishing returns action. - quit
566 */ 580 */
567 if (imbalance < (max_cpu_irq >> 3)) 581 if (imbalance < (max_cpu_irq >> 3))
@@ -577,26 +591,25 @@ tryanotherirq:
577 /* Is this an active IRQ? */ 591 /* Is this an active IRQ? */
578 if (!irq_desc[j].action) 592 if (!irq_desc[j].action)
579 continue; 593 continue;
580 if (imbalance <= IRQ_DELTA(max_loaded,j)) 594 if (imbalance <= IRQ_DELTA(max_loaded, j))
581 continue; 595 continue;
582 /* Try to find the IRQ that is closest to the imbalance 596 /* Try to find the IRQ that is closest to the imbalance
583 * without going over. 597 * without going over.
584 */ 598 */
585 if (move_this_load < IRQ_DELTA(max_loaded,j)) { 599 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
586 move_this_load = IRQ_DELTA(max_loaded,j); 600 move_this_load = IRQ_DELTA(max_loaded, j);
587 selected_irq = j; 601 selected_irq = j;
588 } 602 }
589 } 603 }
590 if (selected_irq == -1) { 604 if (selected_irq == -1)
591 goto tryanothercpu; 605 goto tryanothercpu;
592 }
593 606
594 imbalance = move_this_load; 607 imbalance = move_this_load;
595 608
596 /* For physical_balance case, we accumulated both load 609 /* For physical_balance case, we accumulated both load
597 * values in the one of the siblings cpu_irq[], 610 * values in the one of the siblings cpu_irq[],
598 * to use the same code for physical and logical processors 611 * to use the same code for physical and logical processors
599 * as much as possible. 612 * as much as possible.
600 * 613 *
601 * NOTE: the cpu_irq[] array holds the sum of the load for 614 * NOTE: the cpu_irq[] array holds the sum of the load for
602 * sibling A and sibling B in the slot for the lowest numbered 615 * sibling A and sibling B in the slot for the lowest numbered
@@ -625,11 +638,11 @@ tryanotherirq:
625 /* mark for change destination */ 638 /* mark for change destination */
626 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); 639 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
627 640
628 /* Since we made a change, come back sooner to 641 /* Since we made a change, come back sooner to
629 * check for more variation. 642 * check for more variation.
630 */ 643 */
631 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, 644 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
632 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); 645 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
633 return; 646 return;
634 } 647 }
635 goto tryanotherirq; 648 goto tryanotherirq;
@@ -640,7 +653,7 @@ not_worth_the_effort:
640 * upward 653 * upward
641 */ 654 */
642 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, 655 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
643 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); 656 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
644 return; 657 return;
645} 658}
646 659
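
The do_irq_balance() hunks above sample each IRQ's kstat counter, take the delta since the last pass, accumulate it per package, locate the heaviest-loaded CPU, and then move the IRQ whose delta is as large as possible without exceeding half of the load imbalance. A toy model of that selection step, using made-up counter values, is sketched below.

#include <stdio.h>

#define NR_IRQS 8

int main(void)
{
	/* hypothetical per-IRQ activity deltas on the heaviest CPU */
	unsigned long delta[NR_IRQS] = { 5, 120, 40, 0, 75, 10, 90, 3 };
	unsigned long imbalance = 100;   /* (max_cpu_irq - min_cpu_irq) / 2 */
	unsigned long move_this_load = 0;
	int j, selected_irq = -1;

	for (j = 0; j < NR_IRQS; j++) {
		if (!delta[j])
			continue;              /* inactive IRQ              */
		if (imbalance <= delta[j])
			continue;              /* would overshoot the goal  */
		if (move_this_load < delta[j]) {
			move_this_load = delta[j];
			selected_irq = j;
		}
	}
	printf("move IRQ %d (delta %lu) toward the least loaded CPU\n",
	       selected_irq, move_this_load);
	return 0;
}
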
@@ -679,13 +692,13 @@ static int __init balanced_irq_init(void)
679 cpumask_t tmp; 692 cpumask_t tmp;
680 693
681 cpus_shift_right(tmp, cpu_online_map, 2); 694 cpus_shift_right(tmp, cpu_online_map, 2);
682 c = &boot_cpu_data; 695 c = &boot_cpu_data;
683 /* When not overwritten by the command line ask subarchitecture. */ 696 /* When not overwritten by the command line ask subarchitecture. */
684 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) 697 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
685 irqbalance_disabled = NO_BALANCE_IRQ; 698 irqbalance_disabled = NO_BALANCE_IRQ;
686 if (irqbalance_disabled) 699 if (irqbalance_disabled)
687 return 0; 700 return 0;
688 701
689 /* disable irqbalance completely if there is only one processor online */ 702 /* disable irqbalance completely if there is only one processor online */
690 if (num_online_cpus() < 2) { 703 if (num_online_cpus() < 2) {
691 irqbalance_disabled = 1; 704 irqbalance_disabled = 1;
@@ -699,16 +712,14 @@ static int __init balanced_irq_init(void)
699 physical_balance = 1; 712 physical_balance = 1;
700 713
701 for_each_online_cpu(i) { 714 for_each_online_cpu(i) {
702 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); 715 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
703 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); 716 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
704 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { 717 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
705 printk(KERN_ERR "balanced_irq_init: out of memory"); 718 printk(KERN_ERR "balanced_irq_init: out of memory");
706 goto failed; 719 goto failed;
707 } 720 }
708 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
709 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
710 } 721 }
711 722
712 printk(KERN_INFO "Starting balanced_irq\n"); 723 printk(KERN_INFO "Starting balanced_irq\n");
713 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) 724 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
714 return 0; 725 return 0;
@@ -748,7 +759,7 @@ void send_IPI_self(int vector)
748 /* 759 /*
749 * Send the IPI. The write to APIC_ICR fires this off. 760 * Send the IPI. The write to APIC_ICR fires this off.
750 */ 761 */
751 apic_write_around(APIC_ICR, cfg); 762 apic_write(APIC_ICR, cfg);
752} 763}
753#endif /* !CONFIG_SMP */ 764#endif /* !CONFIG_SMP */
754 765
@@ -801,10 +812,10 @@ static int find_irq_entry(int apic, int pin, int type)
801 int i; 812 int i;
802 813
803 for (i = 0; i < mp_irq_entries; i++) 814 for (i = 0; i < mp_irq_entries; i++)
804 if (mp_irqs[i].mpc_irqtype == type && 815 if (mp_irqs[i].mp_irqtype == type &&
805 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 816 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
806 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 817 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
807 mp_irqs[i].mpc_dstirq == pin) 818 mp_irqs[i].mp_dstirq == pin)
808 return i; 819 return i;
809 820
810 return -1; 821 return -1;
@@ -818,13 +829,13 @@ static int __init find_isa_irq_pin(int irq, int type)
818 int i; 829 int i;
819 830
820 for (i = 0; i < mp_irq_entries; i++) { 831 for (i = 0; i < mp_irq_entries; i++) {
821 int lbus = mp_irqs[i].mpc_srcbus; 832 int lbus = mp_irqs[i].mp_srcbus;
822 833
823 if (test_bit(lbus, mp_bus_not_pci) && 834 if (test_bit(lbus, mp_bus_not_pci) &&
824 (mp_irqs[i].mpc_irqtype == type) && 835 (mp_irqs[i].mp_irqtype == type) &&
825 (mp_irqs[i].mpc_srcbusirq == irq)) 836 (mp_irqs[i].mp_srcbusirq == irq))
826 837
827 return mp_irqs[i].mpc_dstirq; 838 return mp_irqs[i].mp_dstirq;
828 } 839 }
829 return -1; 840 return -1;
830} 841}
@@ -834,17 +845,17 @@ static int __init find_isa_irq_apic(int irq, int type)
834 int i; 845 int i;
835 846
836 for (i = 0; i < mp_irq_entries; i++) { 847 for (i = 0; i < mp_irq_entries; i++) {
837 int lbus = mp_irqs[i].mpc_srcbus; 848 int lbus = mp_irqs[i].mp_srcbus;
838 849
839 if (test_bit(lbus, mp_bus_not_pci) && 850 if (test_bit(lbus, mp_bus_not_pci) &&
840 (mp_irqs[i].mpc_irqtype == type) && 851 (mp_irqs[i].mp_irqtype == type) &&
841 (mp_irqs[i].mpc_srcbusirq == irq)) 852 (mp_irqs[i].mp_srcbusirq == irq))
842 break; 853 break;
843 } 854 }
844 if (i < mp_irq_entries) { 855 if (i < mp_irq_entries) {
845 int apic; 856 int apic;
846 for(apic = 0; apic < nr_ioapics; apic++) { 857 for (apic = 0; apic < nr_ioapics; apic++) {
847 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 858 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
848 return apic; 859 return apic;
849 } 860 }
850 } 861 }
@@ -864,28 +875,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
864 875
865 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " 876 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
866 "slot:%d, pin:%d.\n", bus, slot, pin); 877 "slot:%d, pin:%d.\n", bus, slot, pin);
867 if (mp_bus_id_to_pci_bus[bus] == -1) { 878 if (test_bit(bus, mp_bus_not_pci)) {
868 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 879 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
869 return -1; 880 return -1;
870 } 881 }
871 for (i = 0; i < mp_irq_entries; i++) { 882 for (i = 0; i < mp_irq_entries; i++) {
872 int lbus = mp_irqs[i].mpc_srcbus; 883 int lbus = mp_irqs[i].mp_srcbus;
873 884
874 for (apic = 0; apic < nr_ioapics; apic++) 885 for (apic = 0; apic < nr_ioapics; apic++)
875 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 886 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
876 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 887 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
877 break; 888 break;
878 889
879 if (!test_bit(lbus, mp_bus_not_pci) && 890 if (!test_bit(lbus, mp_bus_not_pci) &&
880 !mp_irqs[i].mpc_irqtype && 891 !mp_irqs[i].mp_irqtype &&
881 (bus == lbus) && 892 (bus == lbus) &&
882 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 893 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
883 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 894 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
884 895
885 if (!(apic || IO_APIC_IRQ(irq))) 896 if (!(apic || IO_APIC_IRQ(irq)))
886 continue; 897 continue;
887 898
888 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 899 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
889 return irq; 900 return irq;
890 /* 901 /*
891 * Use the first all-but-pin matching entry as a 902 * Use the first all-but-pin matching entry as a
@@ -900,7 +911,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
900EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); 911EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
901 912
902/* 913/*
903 * This function currently is only a helper for the i386 smp boot process where 914 * This function currently is only a helper for the i386 smp boot process where
904 * we need to reprogram the ioredtbls to cater for the cpus which have come online 915 * we need to reprogram the ioredtbls to cater for the cpus which have come online
905 * so mask in all cases should simply be TARGET_CPUS 916 * so mask in all cases should simply be TARGET_CPUS
906 */ 917 */
@@ -952,7 +963,7 @@ static int EISA_ELCR(unsigned int irq)
952 * EISA conforming in the MP table, that means its trigger type must 963 * EISA conforming in the MP table, that means its trigger type must
953 * be read in from the ELCR */ 964 * be read in from the ELCR */
954 965
955#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) 966#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
956#define default_EISA_polarity(idx) default_ISA_polarity(idx) 967#define default_EISA_polarity(idx) default_ISA_polarity(idx)
957 968
958/* PCI interrupts are always polarity one level triggered, 969/* PCI interrupts are always polarity one level triggered,
@@ -969,118 +980,115 @@ static int EISA_ELCR(unsigned int irq)
969 980
970static int MPBIOS_polarity(int idx) 981static int MPBIOS_polarity(int idx)
971{ 982{
972 int bus = mp_irqs[idx].mpc_srcbus; 983 int bus = mp_irqs[idx].mp_srcbus;
973 int polarity; 984 int polarity;
974 985
975 /* 986 /*
976 * Determine IRQ line polarity (high active or low active): 987 * Determine IRQ line polarity (high active or low active):
977 */ 988 */
978 switch (mp_irqs[idx].mpc_irqflag & 3) 989 switch (mp_irqs[idx].mp_irqflag & 3) {
990 case 0: /* conforms, ie. bus-type dependent polarity */
979 { 991 {
980 case 0: /* conforms, ie. bus-type dependent polarity */ 992 polarity = test_bit(bus, mp_bus_not_pci)?
981 { 993 default_ISA_polarity(idx):
982 polarity = test_bit(bus, mp_bus_not_pci)? 994 default_PCI_polarity(idx);
983 default_ISA_polarity(idx): 995 break;
984 default_PCI_polarity(idx); 996 }
985 break; 997 case 1: /* high active */
986 } 998 {
987 case 1: /* high active */ 999 polarity = 0;
988 { 1000 break;
989 polarity = 0; 1001 }
990 break; 1002 case 2: /* reserved */
991 } 1003 {
992 case 2: /* reserved */ 1004 printk(KERN_WARNING "broken BIOS!!\n");
993 { 1005 polarity = 1;
994 printk(KERN_WARNING "broken BIOS!!\n"); 1006 break;
995 polarity = 1; 1007 }
996 break; 1008 case 3: /* low active */
997 } 1009 {
998 case 3: /* low active */ 1010 polarity = 1;
999 { 1011 break;
1000 polarity = 1; 1012 }
1001 break; 1013 default: /* invalid */
1002 } 1014 {
1003 default: /* invalid */ 1015 printk(KERN_WARNING "broken BIOS!!\n");
1004 { 1016 polarity = 1;
1005 printk(KERN_WARNING "broken BIOS!!\n"); 1017 break;
1006 polarity = 1; 1018 }
1007 break;
1008 }
1009 } 1019 }
1010 return polarity; 1020 return polarity;
1011} 1021}
1012 1022
1013static int MPBIOS_trigger(int idx) 1023static int MPBIOS_trigger(int idx)
1014{ 1024{
1015 int bus = mp_irqs[idx].mpc_srcbus; 1025 int bus = mp_irqs[idx].mp_srcbus;
1016 int trigger; 1026 int trigger;
1017 1027
1018 /* 1028 /*
1019 * Determine IRQ trigger mode (edge or level sensitive): 1029 * Determine IRQ trigger mode (edge or level sensitive):
1020 */ 1030 */
1021 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 1031 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1032 case 0: /* conforms, ie. bus-type dependent */
1022 { 1033 {
1023 case 0: /* conforms, ie. bus-type dependent */ 1034 trigger = test_bit(bus, mp_bus_not_pci)?
1024 { 1035 default_ISA_trigger(idx):
1025 trigger = test_bit(bus, mp_bus_not_pci)? 1036 default_PCI_trigger(idx);
1026 default_ISA_trigger(idx):
1027 default_PCI_trigger(idx);
1028#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 1037#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1029 switch (mp_bus_id_to_type[bus]) 1038 switch (mp_bus_id_to_type[bus]) {
1030 { 1039 case MP_BUS_ISA: /* ISA pin */
1031 case MP_BUS_ISA: /* ISA pin */ 1040 {
1032 { 1041 /* set before the switch */
1033 /* set before the switch */
1034 break;
1035 }
1036 case MP_BUS_EISA: /* EISA pin */
1037 {
1038 trigger = default_EISA_trigger(idx);
1039 break;
1040 }
1041 case MP_BUS_PCI: /* PCI pin */
1042 {
1043 /* set before the switch */
1044 break;
1045 }
1046 case MP_BUS_MCA: /* MCA pin */
1047 {
1048 trigger = default_MCA_trigger(idx);
1049 break;
1050 }
1051 default:
1052 {
1053 printk(KERN_WARNING "broken BIOS!!\n");
1054 trigger = 1;
1055 break;
1056 }
1057 }
1058#endif
1059 break; 1042 break;
1060 } 1043 }
1061 case 1: /* edge */ 1044 case MP_BUS_EISA: /* EISA pin */
1062 { 1045 {
1063 trigger = 0; 1046 trigger = default_EISA_trigger(idx);
1064 break; 1047 break;
1065 } 1048 }
1066 case 2: /* reserved */ 1049 case MP_BUS_PCI: /* PCI pin */
1067 { 1050 {
1068 printk(KERN_WARNING "broken BIOS!!\n"); 1051 /* set before the switch */
1069 trigger = 1;
1070 break; 1052 break;
1071 } 1053 }
1072 case 3: /* level */ 1054 case MP_BUS_MCA: /* MCA pin */
1073 { 1055 {
1074 trigger = 1; 1056 trigger = default_MCA_trigger(idx);
1075 break; 1057 break;
1076 } 1058 }
1077 default: /* invalid */ 1059 default:
1078 { 1060 {
1079 printk(KERN_WARNING "broken BIOS!!\n"); 1061 printk(KERN_WARNING "broken BIOS!!\n");
1080 trigger = 0; 1062 trigger = 1;
1081 break; 1063 break;
1082 } 1064 }
1083 } 1065 }
1066#endif
1067 break;
1068 }
1069 case 1: /* edge */
1070 {
1071 trigger = 0;
1072 break;
1073 }
1074 case 2: /* reserved */
1075 {
1076 printk(KERN_WARNING "broken BIOS!!\n");
1077 trigger = 1;
1078 break;
1079 }
1080 case 3: /* level */
1081 {
1082 trigger = 1;
1083 break;
1084 }
1085 default: /* invalid */
1086 {
1087 printk(KERN_WARNING "broken BIOS!!\n");
1088 trigger = 0;
1089 break;
1090 }
1091 }
1084 return trigger; 1092 return trigger;
1085} 1093}
1086 1094
@@ -1097,16 +1105,16 @@ static inline int irq_trigger(int idx)
1097static int pin_2_irq(int idx, int apic, int pin) 1105static int pin_2_irq(int idx, int apic, int pin)
1098{ 1106{
1099 int irq, i; 1107 int irq, i;
1100 int bus = mp_irqs[idx].mpc_srcbus; 1108 int bus = mp_irqs[idx].mp_srcbus;
1101 1109
1102 /* 1110 /*
1103 * Debugging check, we are in big trouble if this message pops up! 1111 * Debugging check, we are in big trouble if this message pops up!
1104 */ 1112 */
1105 if (mp_irqs[idx].mpc_dstirq != pin) 1113 if (mp_irqs[idx].mp_dstirq != pin)
1106 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1114 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1107 1115
1108 if (test_bit(bus, mp_bus_not_pci)) 1116 if (test_bit(bus, mp_bus_not_pci))
1109 irq = mp_irqs[idx].mpc_srcbusirq; 1117 irq = mp_irqs[idx].mp_srcbusirq;
1110 else { 1118 else {
1111 /* 1119 /*
1112 * PCI IRQs are mapped in order 1120 * PCI IRQs are mapped in order
@@ -1148,8 +1156,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1148 1156
1149 for (apic = 0; apic < nr_ioapics; apic++) { 1157 for (apic = 0; apic < nr_ioapics; apic++) {
1150 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1158 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1151 idx = find_irq_entry(apic,pin,mp_INT); 1159 idx = find_irq_entry(apic, pin, mp_INT);
1152 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) 1160 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1153 return irq_trigger(idx); 1161 return irq_trigger(idx);
1154 } 1162 }
1155 } 1163 }
@@ -1164,7 +1172,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
1164 1172
1165static int __assign_irq_vector(int irq) 1173static int __assign_irq_vector(int irq)
1166{ 1174{
1167 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1175 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1168 int vector, offset; 1176 int vector, offset;
1169 1177
1170 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); 1178 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1176,7 +1184,7 @@ static int __assign_irq_vector(int irq)
1176 offset = current_offset; 1184 offset = current_offset;
1177next: 1185next:
1178 vector += 8; 1186 vector += 8;
1179 if (vector >= FIRST_SYSTEM_VECTOR) { 1187 if (vector >= first_system_vector) {
1180 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1181 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_DEVICE_VECTOR + offset;
1182 } 1190 }
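
__assign_irq_vector() above hands out device vectors in steps of 8 starting from FIRST_DEVICE_VECTOR, and once it runs past the system-vector boundary it bumps a small offset and wraps, so successive passes walk interleaved lanes of the vector space. The sketch below models only that stepping pattern; the numeric bounds are assumptions, and the real code additionally skips vectors that are already in use and the syscall vector.

#include <stdio.h>

/* assumed illustrative bounds, not necessarily the kernel's values */
#define FIRST_DEVICE_VECTOR  0x31
#define FIRST_SYSTEM_VECTOR  0xef

int main(void)
{
	int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
	int i;

	for (i = 0; i < 5; i++) {
		int vector = current_vector + 8;   /* step by 8 each time */

		if (vector >= FIRST_SYSTEM_VECTOR) {
			/* wrap: rotate to the next of 8 interleaved lanes */
			current_offset = (current_offset + 1) % 8;
			vector = FIRST_DEVICE_VECTOR + current_offset;
		}
		current_vector = vector;
		printf("allocation %d -> vector 0x%02x\n", i, vector);
	}
	return 0;
}
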
@@ -1203,6 +1211,7 @@ static int assign_irq_vector(int irq)
1203 1211
1204 return vector; 1212 return vector;
1205} 1213}
1214
1206static struct irq_chip ioapic_chip; 1215static struct irq_chip ioapic_chip;
1207 1216
1208#define IOAPIC_AUTO -1 1217#define IOAPIC_AUTO -1
@@ -1237,25 +1246,25 @@ static void __init setup_IO_APIC_irqs(void)
1237 /* 1246 /*
1238 * add it to the IO-APIC irq-routing table: 1247 * add it to the IO-APIC irq-routing table:
1239 */ 1248 */
1240 memset(&entry,0,sizeof(entry)); 1249 memset(&entry, 0, sizeof(entry));
1241 1250
1242 entry.delivery_mode = INT_DELIVERY_MODE; 1251 entry.delivery_mode = INT_DELIVERY_MODE;
1243 entry.dest_mode = INT_DEST_MODE; 1252 entry.dest_mode = INT_DEST_MODE;
1244 entry.mask = 0; /* enable IRQ */ 1253 entry.mask = 0; /* enable IRQ */
1245 entry.dest.logical.logical_dest = 1254 entry.dest.logical.logical_dest =
1246 cpu_mask_to_apicid(TARGET_CPUS); 1255 cpu_mask_to_apicid(TARGET_CPUS);
1247 1256
1248 idx = find_irq_entry(apic,pin,mp_INT); 1257 idx = find_irq_entry(apic, pin, mp_INT);
1249 if (idx == -1) { 1258 if (idx == -1) {
1250 if (first_notcon) { 1259 if (first_notcon) {
1251 apic_printk(APIC_VERBOSE, KERN_DEBUG 1260 apic_printk(APIC_VERBOSE, KERN_DEBUG
1252 " IO-APIC (apicid-pin) %d-%d", 1261 " IO-APIC (apicid-pin) %d-%d",
1253 mp_ioapics[apic].mpc_apicid, 1262 mp_ioapics[apic].mp_apicid,
1254 pin); 1263 pin);
1255 first_notcon = 0; 1264 first_notcon = 0;
1256 } else 1265 } else
1257 apic_printk(APIC_VERBOSE, ", %d-%d", 1266 apic_printk(APIC_VERBOSE, ", %d-%d",
1258 mp_ioapics[apic].mpc_apicid, pin); 1267 mp_ioapics[apic].mp_apicid, pin);
1259 continue; 1268 continue;
1260 } 1269 }
1261 1270
@@ -1289,7 +1298,7 @@ static void __init setup_IO_APIC_irqs(void)
1289 vector = assign_irq_vector(irq); 1298 vector = assign_irq_vector(irq);
1290 entry.vector = vector; 1299 entry.vector = vector;
1291 ioapic_register_intr(irq, vector, IOAPIC_AUTO); 1300 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1292 1301
1293 if (!apic && (irq < 16)) 1302 if (!apic && (irq < 16))
1294 disable_8259A_irq(irq); 1303 disable_8259A_irq(irq);
1295 } 1304 }
@@ -1302,25 +1311,21 @@ static void __init setup_IO_APIC_irqs(void)
1302} 1311}
1303 1312
1304/* 1313/*
1305 * Set up the 8259A-master output pin: 1314 * Set up the timer pin, possibly with the 8259A-master behind.
1306 */ 1315 */
1307static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) 1316static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1317 int vector)
1308{ 1318{
1309 struct IO_APIC_route_entry entry; 1319 struct IO_APIC_route_entry entry;
1310 1320
1311 memset(&entry,0,sizeof(entry)); 1321 memset(&entry, 0, sizeof(entry));
1312
1313 disable_8259A_irq(0);
1314
1315 /* mask LVT0 */
1316 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1317 1322
1318 /* 1323 /*
1319 * We use logical delivery to get the timer IRQ 1324 * We use logical delivery to get the timer IRQ
1320 * to the first CPU. 1325 * to the first CPU.
1321 */ 1326 */
1322 entry.dest_mode = INT_DEST_MODE; 1327 entry.dest_mode = INT_DEST_MODE;
1323 entry.mask = 0; /* unmask IRQ now */ 1328 entry.mask = 1; /* mask IRQ now */
1324 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 1329 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1325 entry.delivery_mode = INT_DELIVERY_MODE; 1330 entry.delivery_mode = INT_DELIVERY_MODE;
1326 entry.polarity = 0; 1331 entry.polarity = 0;
@@ -1329,20 +1334,18 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
1329 1334
1330 /* 1335 /*
1331 * The timer IRQ doesn't have to know that behind the 1336 * The timer IRQ doesn't have to know that behind the
1332 * scene we have a 8259A-master in AEOI mode ... 1337 * scene we may have a 8259A-master in AEOI mode ...
1333 */ 1338 */
1334 irq_desc[0].chip = &ioapic_chip; 1339 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1335 set_irq_handler(0, handle_edge_irq);
1336 1340
1337 /* 1341 /*
1338 * Add it to the IO-APIC irq-routing table: 1342 * Add it to the IO-APIC irq-routing table:
1339 */ 1343 */
1340 ioapic_write_entry(apic, pin, entry); 1344 ioapic_write_entry(apic, pin, entry);
1341
1342 enable_8259A_irq(0);
1343} 1345}
1344 1346
1345void __init print_IO_APIC(void) 1347
1348__apicdebuginit(void) print_IO_APIC(void)
1346{ 1349{
1347 int apic, i; 1350 int apic, i;
1348 union IO_APIC_reg_00 reg_00; 1351 union IO_APIC_reg_00 reg_00;
@@ -1354,10 +1357,10 @@ void __init print_IO_APIC(void)
1354 if (apic_verbosity == APIC_QUIET) 1357 if (apic_verbosity == APIC_QUIET)
1355 return; 1358 return;
1356 1359
1357 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1360 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1358 for (i = 0; i < nr_ioapics; i++) 1361 for (i = 0; i < nr_ioapics; i++)
1359 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1362 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1360 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 1363 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1361 1364
1362 /* 1365 /*
1363 * We are a bit conservative about what we expect. We have to 1366 * We are a bit conservative about what we expect. We have to
@@ -1376,7 +1379,7 @@ void __init print_IO_APIC(void)
1376 reg_03.raw = io_apic_read(apic, 3); 1379 reg_03.raw = io_apic_read(apic, 3);
1377 spin_unlock_irqrestore(&ioapic_lock, flags); 1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1378 1381
1379 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 1382 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1380 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1383 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1381 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1384 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1382 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1385 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1457,9 +1460,7 @@ void __init print_IO_APIC(void)
1457 return; 1460 return;
1458} 1461}
1459 1462
1460#if 0 1463__apicdebuginit(void) print_APIC_bitfield(int base)
1461
1462static void print_APIC_bitfield (int base)
1463{ 1464{
1464 unsigned int v; 1465 unsigned int v;
1465 int i, j; 1466 int i, j;
@@ -1480,17 +1481,19 @@ static void print_APIC_bitfield (int base)
1480 } 1481 }
1481} 1482}
1482 1483
1483void /*__init*/ print_local_APIC(void * dummy) 1484__apicdebuginit(void) print_local_APIC(void *dummy)
1484{ 1485{
1485 unsigned int v, ver, maxlvt; 1486 unsigned int v, ver, maxlvt;
1487 u64 icr;
1486 1488
1487 if (apic_verbosity == APIC_QUIET) 1489 if (apic_verbosity == APIC_QUIET)
1488 return; 1490 return;
1489 1491
1490 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1492 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1491 smp_processor_id(), hard_smp_processor_id()); 1493 smp_processor_id(), hard_smp_processor_id());
1494 v = apic_read(APIC_ID);
1492 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, 1495 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1493 GET_APIC_ID(read_apic_id())); 1496 GET_APIC_ID(v));
1494 v = apic_read(APIC_LVR); 1497 v = apic_read(APIC_LVR);
1495 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1498 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1496 ver = GET_APIC_VERSION(v); 1499 ver = GET_APIC_VERSION(v);
@@ -1532,10 +1535,9 @@ void /*__init*/ print_local_APIC(void * dummy)
1532 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1535 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1533 } 1536 }
1534 1537
1535 v = apic_read(APIC_ICR); 1538 icr = apic_icr_read();
1536 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1539 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
1537 v = apic_read(APIC_ICR2); 1540 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
1538 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1539 1541
1540 v = apic_read(APIC_LVTT); 1542 v = apic_read(APIC_LVTT);
1541 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1543 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1563,12 +1565,12 @@ void /*__init*/ print_local_APIC(void * dummy)
1563 printk("\n"); 1565 printk("\n");
1564} 1566}
1565 1567
1566void print_all_local_APICs (void) 1568__apicdebuginit(void) print_all_local_APICs(void)
1567{ 1569{
1568 on_each_cpu(print_local_APIC, NULL, 1, 1); 1570 on_each_cpu(print_local_APIC, NULL, 1);
1569} 1571}
1570 1572
1571void /*__init*/ print_PIC(void) 1573__apicdebuginit(void) print_PIC(void)
1572{ 1574{
1573 unsigned int v; 1575 unsigned int v;
1574 unsigned long flags; 1576 unsigned long flags;
@@ -1586,11 +1588,11 @@ void /*__init*/ print_PIC(void)
1586 v = inb(0xa0) << 8 | inb(0x20); 1588 v = inb(0xa0) << 8 | inb(0x20);
1587 printk(KERN_DEBUG "... PIC IRR: %04x\n", v); 1589 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1588 1590
1589 outb(0x0b,0xa0); 1591 outb(0x0b, 0xa0);
1590 outb(0x0b,0x20); 1592 outb(0x0b, 0x20);
1591 v = inb(0xa0) << 8 | inb(0x20); 1593 v = inb(0xa0) << 8 | inb(0x20);
1592 outb(0x0a,0xa0); 1594 outb(0x0a, 0xa0);
1593 outb(0x0a,0x20); 1595 outb(0x0a, 0x20);
1594 1596
1595 spin_unlock_irqrestore(&i8259A_lock, flags); 1597 spin_unlock_irqrestore(&i8259A_lock, flags);
1596 1598
@@ -1600,7 +1602,17 @@ void /*__init*/ print_PIC(void)
1600 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1602 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1601} 1603}
1602 1604
1603#endif /* 0 */ 1605__apicdebuginit(int) print_all_ICs(void)
1606{
1607 print_PIC();
1608 print_all_local_APICs();
1609 print_IO_APIC();
1610
1611 return 0;
1612}
1613
1614fs_initcall(print_all_ICs);
1615
1604 1616
1605static void __init enable_IO_APIC(void) 1617static void __init enable_IO_APIC(void)
1606{ 1618{
@@ -1626,7 +1638,7 @@ static void __init enable_IO_APIC(void)
1626 spin_unlock_irqrestore(&ioapic_lock, flags); 1638 spin_unlock_irqrestore(&ioapic_lock, flags);
1627 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1639 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1628 } 1640 }
1629 for(apic = 0; apic < nr_ioapics; apic++) { 1641 for (apic = 0; apic < nr_ioapics; apic++) {
1630 int pin; 1642 int pin;
1631 /* See if any of the pins is in ExtINT mode */ 1643 /* See if any of the pins is in ExtINT mode */
1632 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1644 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1698,8 +1710,7 @@ void disable_IO_APIC(void)
1698 entry.dest_mode = 0; /* Physical */ 1710 entry.dest_mode = 0; /* Physical */
1699 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1711 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1700 entry.vector = 0; 1712 entry.vector = 0;
1701 entry.dest.physical.physical_dest = 1713 entry.dest.physical.physical_dest = read_apic_id();
1702 GET_APIC_ID(read_apic_id());
1703 1714
1704 /* 1715 /*
1705 * Add it to the IO-APIC irq-routing table: 1716 * Add it to the IO-APIC irq-routing table:
@@ -1716,7 +1727,6 @@ void disable_IO_APIC(void)
1716 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1727 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1717 */ 1728 */
1718 1729
1719#ifndef CONFIG_X86_NUMAQ
1720static void __init setup_ioapic_ids_from_mpc(void) 1730static void __init setup_ioapic_ids_from_mpc(void)
1721{ 1731{
1722 union IO_APIC_reg_00 reg_00; 1732 union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1736,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
1726 unsigned char old_id; 1736 unsigned char old_id;
1727 unsigned long flags; 1737 unsigned long flags;
1728 1738
1739 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1740 return;
1741
1729 /* 1742 /*
1730 * Don't check I/O APIC IDs for xAPIC systems. They have 1743 * Don't check I/O APIC IDs for xAPIC systems. They have
1731 * no meaning without the serial APIC bus. 1744 * no meaning without the serial APIC bus.
@@ -1748,15 +1761,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1748 spin_lock_irqsave(&ioapic_lock, flags); 1761 spin_lock_irqsave(&ioapic_lock, flags);
1749 reg_00.raw = io_apic_read(apic, 0); 1762 reg_00.raw = io_apic_read(apic, 0);
1750 spin_unlock_irqrestore(&ioapic_lock, flags); 1763 spin_unlock_irqrestore(&ioapic_lock, flags);
1751
1752 old_id = mp_ioapics[apic].mpc_apicid;
1753 1764
1754 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { 1765 old_id = mp_ioapics[apic].mp_apicid;
1766
1767 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1755 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1768 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1756 apic, mp_ioapics[apic].mpc_apicid); 1769 apic, mp_ioapics[apic].mp_apicid);
1757 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1770 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1758 reg_00.bits.ID); 1771 reg_00.bits.ID);
1759 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; 1772 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1760 } 1773 }
1761 1774
1762 /* 1775 /*
@@ -1765,9 +1778,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
1765 * 'stuck on smp_invalidate_needed IPI wait' messages. 1778 * 'stuck on smp_invalidate_needed IPI wait' messages.
1766 */ 1779 */
1767 if (check_apicid_used(phys_id_present_map, 1780 if (check_apicid_used(phys_id_present_map,
1768 mp_ioapics[apic].mpc_apicid)) { 1781 mp_ioapics[apic].mp_apicid)) {
1769 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1782 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1770 apic, mp_ioapics[apic].mpc_apicid); 1783 apic, mp_ioapics[apic].mp_apicid);
1771 for (i = 0; i < get_physical_broadcast(); i++) 1784 for (i = 0; i < get_physical_broadcast(); i++)
1772 if (!physid_isset(i, phys_id_present_map)) 1785 if (!physid_isset(i, phys_id_present_map))
1773 break; 1786 break;
@@ -1776,13 +1789,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
1776 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1789 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1777 i); 1790 i);
1778 physid_set(i, phys_id_present_map); 1791 physid_set(i, phys_id_present_map);
1779 mp_ioapics[apic].mpc_apicid = i; 1792 mp_ioapics[apic].mp_apicid = i;
1780 } else { 1793 } else {
1781 physid_mask_t tmp; 1794 physid_mask_t tmp;
1782 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); 1795 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1783 apic_printk(APIC_VERBOSE, "Setting %d in the " 1796 apic_printk(APIC_VERBOSE, "Setting %d in the "
1784 "phys_id_present_map\n", 1797 "phys_id_present_map\n",
1785 mp_ioapics[apic].mpc_apicid); 1798 mp_ioapics[apic].mp_apicid);
1786 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1799 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1787 } 1800 }
1788 1801
@@ -1791,21 +1804,21 @@ static void __init setup_ioapic_ids_from_mpc(void)
1791 * We need to adjust the IRQ routing table 1804 * We need to adjust the IRQ routing table
1792 * if the ID changed. 1805 * if the ID changed.
1793 */ 1806 */
1794 if (old_id != mp_ioapics[apic].mpc_apicid) 1807 if (old_id != mp_ioapics[apic].mp_apicid)
1795 for (i = 0; i < mp_irq_entries; i++) 1808 for (i = 0; i < mp_irq_entries; i++)
1796 if (mp_irqs[i].mpc_dstapic == old_id) 1809 if (mp_irqs[i].mp_dstapic == old_id)
1797 mp_irqs[i].mpc_dstapic 1810 mp_irqs[i].mp_dstapic
1798 = mp_ioapics[apic].mpc_apicid; 1811 = mp_ioapics[apic].mp_apicid;
1799 1812
1800 /* 1813 /*
1801 * Read the right value from the MPC table and 1814 * Read the right value from the MPC table and
1802 * write it into the ID register. 1815 * write it into the ID register.
1803 */ 1816 */
1804 apic_printk(APIC_VERBOSE, KERN_INFO 1817 apic_printk(APIC_VERBOSE, KERN_INFO
1805 "...changing IO-APIC physical APIC ID to %d ...", 1818 "...changing IO-APIC physical APIC ID to %d ...",
1806 mp_ioapics[apic].mpc_apicid); 1819 mp_ioapics[apic].mp_apicid);
1807 1820
1808 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; 1821 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1809 spin_lock_irqsave(&ioapic_lock, flags); 1822 spin_lock_irqsave(&ioapic_lock, flags);
1810 io_apic_write(apic, 0, reg_00.raw); 1823 io_apic_write(apic, 0, reg_00.raw);
1811 spin_unlock_irqrestore(&ioapic_lock, flags); 1824 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1829,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
1816 spin_lock_irqsave(&ioapic_lock, flags); 1829 spin_lock_irqsave(&ioapic_lock, flags);
1817 reg_00.raw = io_apic_read(apic, 0); 1830 reg_00.raw = io_apic_read(apic, 0);
1818 spin_unlock_irqrestore(&ioapic_lock, flags); 1831 spin_unlock_irqrestore(&ioapic_lock, flags);
1819 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) 1832 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1820 printk("could not set ID!\n"); 1833 printk("could not set ID!\n");
1821 else 1834 else
1822 apic_printk(APIC_VERBOSE, " ok.\n"); 1835 apic_printk(APIC_VERBOSE, " ok.\n");
1823 } 1836 }
1824} 1837}
1825#else
1826static void __init setup_ioapic_ids_from_mpc(void) { }
1827#endif
1828 1838
1829int no_timer_check __initdata; 1839int no_timer_check __initdata;
1830 1840
@@ -2015,45 +2025,53 @@ static inline void init_IO_APIC_traps(void)
2015 * The local APIC irq-chip implementation: 2025 * The local APIC irq-chip implementation:
2016 */ 2026 */
2017 2027
2018static void ack_apic(unsigned int irq) 2028static void ack_lapic_irq(unsigned int irq)
2019{ 2029{
2020 ack_APIC_irq(); 2030 ack_APIC_irq();
2021} 2031}
2022 2032
2023static void mask_lapic_irq (unsigned int irq) 2033static void mask_lapic_irq(unsigned int irq)
2024{ 2034{
2025 unsigned long v; 2035 unsigned long v;
2026 2036
2027 v = apic_read(APIC_LVT0); 2037 v = apic_read(APIC_LVT0);
2028 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2038 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2029} 2039}
2030 2040
2031static void unmask_lapic_irq (unsigned int irq) 2041static void unmask_lapic_irq(unsigned int irq)
2032{ 2042{
2033 unsigned long v; 2043 unsigned long v;
2034 2044
2035 v = apic_read(APIC_LVT0); 2045 v = apic_read(APIC_LVT0);
2036 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2046 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2037} 2047}
2038 2048
2039static struct irq_chip lapic_chip __read_mostly = { 2049static struct irq_chip lapic_chip __read_mostly = {
2040 .name = "local-APIC-edge", 2050 .name = "local-APIC",
2041 .mask = mask_lapic_irq, 2051 .mask = mask_lapic_irq,
2042 .unmask = unmask_lapic_irq, 2052 .unmask = unmask_lapic_irq,
2043 .eoi = ack_apic, 2053 .ack = ack_lapic_irq,
2044}; 2054};
2045 2055
2056static void lapic_register_intr(int irq, int vector)
2057{
2058 irq_desc[irq].status &= ~IRQ_LEVEL;
2059 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2060 "edge");
2061 set_intr_gate(vector, interrupt[irq]);
2062}
2063
2046static void __init setup_nmi(void) 2064static void __init setup_nmi(void)
2047{ 2065{
2048 /* 2066 /*
2049 * Dirty trick to enable the NMI watchdog ... 2067 * Dirty trick to enable the NMI watchdog ...
2050 * We put the 8259A master into AEOI mode and 2068 * We put the 8259A master into AEOI mode and
2051 * unmask on all local APICs LVT0 as NMI. 2069 * unmask on all local APICs LVT0 as NMI.
2052 * 2070 *
2053 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2071 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2054 * is from Maciej W. Rozycki - so we do not have to EOI from 2072 * is from Maciej W. Rozycki - so we do not have to EOI from
2055 * the NMI handler or the timer interrupt. 2073 * the NMI handler or the timer interrupt.
2056 */ 2074 */
2057 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); 2075 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2058 2076
2059 enable_NMI_through_LVT0(); 2077 enable_NMI_through_LVT0();
@@ -2129,11 +2147,16 @@ static inline void __init unlock_ExtINT_logic(void)
2129static inline void __init check_timer(void) 2147static inline void __init check_timer(void)
2130{ 2148{
2131 int apic1, pin1, apic2, pin2; 2149 int apic1, pin1, apic2, pin2;
2150 int no_pin1 = 0;
2132 int vector; 2151 int vector;
2152 unsigned int ver;
2133 unsigned long flags; 2153 unsigned long flags;
2134 2154
2135 local_irq_save(flags); 2155 local_irq_save(flags);
2136 2156
2157 ver = apic_read(APIC_LVR);
2158 ver = GET_APIC_VERSION(ver);
2159
2137 /* 2160 /*
2138 * get/set the timer IRQ vector: 2161 * get/set the timer IRQ vector:
2139 */ 2162 */
@@ -2142,34 +2165,54 @@ static inline void __init check_timer(void)
2142 set_intr_gate(vector, interrupt[0]); 2165 set_intr_gate(vector, interrupt[0]);
2143 2166
2144 /* 2167 /*
2145 * Subtle, code in do_timer_interrupt() expects an AEOI 2168 * As IRQ0 is to be enabled in the 8259A, the virtual
2146 * mode for the 8259A whenever interrupts are routed 2169 * wire has to be disabled in the local APIC. Also
2147 * through I/O APICs. Also IRQ0 has to be enabled in 2170 * timer interrupts need to be acknowledged manually in
2148 * the 8259A which implies the virtual wire has to be 2171 * the 8259A for the i82489DX when using the NMI
2149 * disabled in the local APIC. 2172 * watchdog as that APIC treats NMIs as level-triggered.
2173 * The AEOI mode will finish them in the 8259A
2174 * automatically.
2150 */ 2175 */
2151 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2176 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2152 init_8259A(1); 2177 init_8259A(1);
2153 timer_ack = 1; 2178 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2154 if (timer_over_8254 > 0)
2155 enable_8259A_irq(0);
2156 2179
2157 pin1 = find_isa_irq_pin(0, mp_INT); 2180 pin1 = find_isa_irq_pin(0, mp_INT);
2158 apic1 = find_isa_irq_apic(0, mp_INT); 2181 apic1 = find_isa_irq_apic(0, mp_INT);
2159 pin2 = ioapic_i8259.pin; 2182 pin2 = ioapic_i8259.pin;
2160 apic2 = ioapic_i8259.apic; 2183 apic2 = ioapic_i8259.apic;
2161 2184
2162 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2185 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2163 vector, apic1, pin1, apic2, pin2); 2186 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2187 vector, apic1, pin1, apic2, pin2);
2188
2189 /*
2190 * Some BIOS writers are clueless and report the ExtINTA
2191 * I/O APIC input from the cascaded 8259A as the timer
2192 * interrupt input. So just in case, if only one pin
2193 * was found above, try it both directly and through the
2194 * 8259A.
2195 */
2196 if (pin1 == -1) {
2197 pin1 = pin2;
2198 apic1 = apic2;
2199 no_pin1 = 1;
2200 } else if (pin2 == -1) {
2201 pin2 = pin1;
2202 apic2 = apic1;
2203 }
2164 2204
2165 if (pin1 != -1) { 2205 if (pin1 != -1) {
2166 /* 2206 /*
2167 * Ok, does IRQ0 through the IOAPIC work? 2207 * Ok, does IRQ0 through the IOAPIC work?
2168 */ 2208 */
2209 if (no_pin1) {
2210 add_pin_to_irq(0, apic1, pin1);
2211 setup_timer_IRQ0_pin(apic1, pin1, vector);
2212 }
2169 unmask_IO_APIC_irq(0); 2213 unmask_IO_APIC_irq(0);
2170 if (timer_irq_works()) { 2214 if (timer_irq_works()) {
2171 if (nmi_watchdog == NMI_IO_APIC) { 2215 if (nmi_watchdog == NMI_IO_APIC) {
2172 disable_8259A_irq(0);
2173 setup_nmi(); 2216 setup_nmi();
2174 enable_8259A_irq(0); 2217 enable_8259A_irq(0);
2175 } 2218 }
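
The rewritten check_timer() above tries the timer interrupt along a fixed fallback ladder: IRQ0 straight through the I/O APIC (pin1), then through the cascaded 8259A (pin2, recorded in timer_through_8259), then the local APIC as a Virtual Wire IRQ, then raw ExtINT, and it panics only if all of them fail. The control flow is modelled below with timer_irq_works() stubbed out, purely as an illustration of the ordering.

#include <stdio.h>
#include <string.h>

/* stub: pretend only the through-8259A route delivers timer ticks */
static int timer_irq_works(const char *route)
{
	return strcmp(route, "pin2 through the 8259A") == 0;
}

int main(void)
{
	const char *routes[] = {
		"pin1 straight into the IO-APIC",
		"pin2 through the 8259A",        /* sets timer_through_8259 */
		"local APIC as Virtual Wire IRQ",
		"ExtINT as a last resort",
	};
	int i;

	for (i = 0; i < 4; i++) {
		if (timer_irq_works(routes[i])) {
			printf("..... works: %s\n", routes[i]);
			return 0;
		}
		printf("..... failed: %s, trying the next route\n",
		       routes[i]);
	}
	printf("IO-APIC + timer doesn't work!\n");   /* the panic() case */
	return 1;
}
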
@@ -2178,81 +2221,97 @@ static inline void __init check_timer(void)
2178 goto out; 2221 goto out;
2179 } 2222 }
2180 clear_IO_APIC_pin(apic1, pin1); 2223 clear_IO_APIC_pin(apic1, pin1);
2181 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " 2224 if (!no_pin1)
2182 "IO-APIC\n"); 2225 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2183 } 2226 "8254 timer not connected to IO-APIC\n");
2184 2227
2185 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); 2228 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2186 if (pin2 != -1) { 2229 "(IRQ0) through the 8259A ...\n");
2187 printk("\n..... (found pin %d) ...", pin2); 2230 apic_printk(APIC_QUIET, KERN_INFO
2231 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2188 /* 2232 /*
2189 * legacy devices should be connected to IO APIC #0 2233 * legacy devices should be connected to IO APIC #0
2190 */ 2234 */
2191 setup_ExtINT_IRQ0_pin(apic2, pin2, vector); 2235 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2236 setup_timer_IRQ0_pin(apic2, pin2, vector);
2237 unmask_IO_APIC_irq(0);
2238 enable_8259A_irq(0);
2192 if (timer_irq_works()) { 2239 if (timer_irq_works()) {
2193 printk("works.\n"); 2240 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2194 if (pin1 != -1) 2241 timer_through_8259 = 1;
2195 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2196 else
2197 add_pin_to_irq(0, apic2, pin2);
2198 if (nmi_watchdog == NMI_IO_APIC) { 2242 if (nmi_watchdog == NMI_IO_APIC) {
2243 disable_8259A_irq(0);
2199 setup_nmi(); 2244 setup_nmi();
2245 enable_8259A_irq(0);
2200 } 2246 }
2201 goto out; 2247 goto out;
2202 } 2248 }
2203 /* 2249 /*
2204 * Cleanup, just in case ... 2250 * Cleanup, just in case ...
2205 */ 2251 */
2252 disable_8259A_irq(0);
2206 clear_IO_APIC_pin(apic2, pin2); 2253 clear_IO_APIC_pin(apic2, pin2);
2254 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2207 } 2255 }
2208 printk(" failed.\n");
2209 2256
2210 if (nmi_watchdog == NMI_IO_APIC) { 2257 if (nmi_watchdog == NMI_IO_APIC) {
2211 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2258 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2212 nmi_watchdog = 0; 2259 "through the IO-APIC - disabling NMI Watchdog!\n");
2260 nmi_watchdog = NMI_NONE;
2213 } 2261 }
2262 timer_ack = 0;
2214 2263
2215 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2264 apic_printk(APIC_QUIET, KERN_INFO
2265 "...trying to set up timer as Virtual Wire IRQ...\n");
2216 2266
2217 disable_8259A_irq(0); 2267 lapic_register_intr(0, vector);
2218 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2268 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2219 "fasteoi");
2220 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2221 enable_8259A_irq(0); 2269 enable_8259A_irq(0);
2222 2270
2223 if (timer_irq_works()) { 2271 if (timer_irq_works()) {
2224 printk(" works.\n"); 2272 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2225 goto out; 2273 goto out;
2226 } 2274 }
2227 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2275 disable_8259A_irq(0);
2228 printk(" failed.\n"); 2276 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2277 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2229 2278
2230 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2279 apic_printk(APIC_QUIET, KERN_INFO
2280 "...trying to set up timer as ExtINT IRQ...\n");
2231 2281
2232 timer_ack = 0;
2233 init_8259A(0); 2282 init_8259A(0);
2234 make_8259A_irq(0); 2283 make_8259A_irq(0);
2235 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2284 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2236 2285
2237 unlock_ExtINT_logic(); 2286 unlock_ExtINT_logic();
2238 2287
2239 if (timer_irq_works()) { 2288 if (timer_irq_works()) {
2240 printk(" works.\n"); 2289 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2241 goto out; 2290 goto out;
2242 } 2291 }
2243 printk(" failed :(.\n"); 2292 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2244 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2293 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2245 "report. Then try booting with the 'noapic' option"); 2294 "report. Then try booting with the 'noapic' option.\n");
2246out: 2295out:
2247 local_irq_restore(flags); 2296 local_irq_restore(flags);
2248} 2297}
2249 2298
2250/* 2299/*
2251 * 2300 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2252 * IRQ's that are handled by the PIC in the MPS IOAPIC case. 2301 * to devices. However there may be an I/O APIC pin available for
2253 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 2302 * this interrupt regardless. The pin may be left unconnected, but
2254 * Linux doesn't really care, as it's not actually used 2303 * typically it will be reused as an ExtINT cascade interrupt for
2255 * for any interrupt handling anyway. 2304 * the master 8259A. In the MPS case such a pin will normally be
2305 * reported as an ExtINT interrupt in the MP table. With ACPI
2306 * there is no provision for ExtINT interrupts, and in the absence
2307 * of an override it would be treated as an ordinary ISA I/O APIC
2308 * interrupt, that is edge-triggered and unmasked by default. We
2309 * used to do this, but it caused problems on some systems because
2310 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2311 * the same ExtINT cascade interrupt to drive the local APIC of the
2312 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2313 * the I/O APIC in all cases now. No actual device should request
2314 * it anyway. --macro
2256 */ 2315 */
2257#define PIC_IRQS (1 << PIC_CASCADE_IR) 2316#define PIC_IRQS (1 << PIC_CASCADE_IR)
2258 2317
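For reference, PIC_CASCADE_IR is 2, so PIC_IRQS is just the IRQ2 bit; with io_apic_irqs = ~PIC_IRQS every ISA IRQ except the cascade is treated as I/O APIC driven. A small illustration (the IO_APIC_IRQ() body shown is the usual definition for this tree and should be read as an assumption):

    /* PIC_CASCADE_IR == 2, so:                                   */
    /*   PIC_IRQS     == (1 << 2)  == 0x0004                      */
    /*   io_apic_irqs == ~PIC_IRQS == 0xfffffffb                  */
    /* and the usual test                                         */
    /*   #define IO_APIC_IRQ(x) (((x) >= 16) || ((1 << (x)) & io_apic_irqs)) */
    /* is false only for IRQ2, keeping the cascade off the I/O APIC. */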
@@ -2261,15 +2320,12 @@ void __init setup_IO_APIC(void)
2261 int i; 2320 int i;
2262 2321
2263 /* Reserve all the system vectors. */ 2322 /* Reserve all the system vectors. */
2264 for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) 2323 for (i = first_system_vector; i < NR_VECTORS; i++)
2265 set_bit(i, used_vectors); 2324 set_bit(i, used_vectors);
2266 2325
2267 enable_IO_APIC(); 2326 enable_IO_APIC();
2268 2327
2269 if (acpi_ioapic) 2328 io_apic_irqs = ~PIC_IRQS;
2270 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2271 else
2272 io_apic_irqs = ~PIC_IRQS;
2273 2329
2274 printk("ENABLING IO-APIC IRQs\n"); 2330 printk("ENABLING IO-APIC IRQs\n");
2275 2331
@@ -2282,32 +2338,16 @@ void __init setup_IO_APIC(void)
2282 setup_IO_APIC_irqs(); 2338 setup_IO_APIC_irqs();
2283 init_IO_APIC_traps(); 2339 init_IO_APIC_traps();
2284 check_timer(); 2340 check_timer();
2285 if (!acpi_ioapic)
2286 print_IO_APIC();
2287} 2341}
2288 2342
2289static int __init setup_disable_8254_timer(char *s)
2290{
2291 timer_over_8254 = -1;
2292 return 1;
2293}
2294static int __init setup_enable_8254_timer(char *s)
2295{
2296 timer_over_8254 = 2;
2297 return 1;
2298}
2299
2300__setup("disable_8254_timer", setup_disable_8254_timer);
2301__setup("enable_8254_timer", setup_enable_8254_timer);
2302
2303/* 2343/*
2304 * Called after all the initialization is done. If we didn't find any 2344 * Called after all the initialization is done. If we didn't find any
2305 * APIC bugs then we can allow the modify fast path 2345 * APIC bugs then we can allow the modify fast path
2306 */ 2346 */
2307 2347
2308static int __init io_apic_bug_finalize(void) 2348static int __init io_apic_bug_finalize(void)
2309{ 2349{
2310 if(sis_apic_bug == -1) 2350 if (sis_apic_bug == -1)
2311 sis_apic_bug = 0; 2351 sis_apic_bug = 0;
2312 return 0; 2352 return 0;
2313} 2353}
@@ -2318,17 +2358,17 @@ struct sysfs_ioapic_data {
2318 struct sys_device dev; 2358 struct sys_device dev;
2319 struct IO_APIC_route_entry entry[0]; 2359 struct IO_APIC_route_entry entry[0];
2320}; 2360};
2321static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; 2361static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2322 2362
2323static int ioapic_suspend(struct sys_device *dev, pm_message_t state) 2363static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2324{ 2364{
2325 struct IO_APIC_route_entry *entry; 2365 struct IO_APIC_route_entry *entry;
2326 struct sysfs_ioapic_data *data; 2366 struct sysfs_ioapic_data *data;
2327 int i; 2367 int i;
2328 2368
2329 data = container_of(dev, struct sysfs_ioapic_data, dev); 2369 data = container_of(dev, struct sysfs_ioapic_data, dev);
2330 entry = data->entry; 2370 entry = data->entry;
2331 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) 2371 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2332 entry[i] = ioapic_read_entry(dev->id, i); 2372 entry[i] = ioapic_read_entry(dev->id, i);
2333 2373
2334 return 0; 2374 return 0;
@@ -2341,18 +2381,18 @@ static int ioapic_resume(struct sys_device *dev)
2341 unsigned long flags; 2381 unsigned long flags;
2342 union IO_APIC_reg_00 reg_00; 2382 union IO_APIC_reg_00 reg_00;
2343 int i; 2383 int i;
2344 2384
2345 data = container_of(dev, struct sysfs_ioapic_data, dev); 2385 data = container_of(dev, struct sysfs_ioapic_data, dev);
2346 entry = data->entry; 2386 entry = data->entry;
2347 2387
2348 spin_lock_irqsave(&ioapic_lock, flags); 2388 spin_lock_irqsave(&ioapic_lock, flags);
2349 reg_00.raw = io_apic_read(dev->id, 0); 2389 reg_00.raw = io_apic_read(dev->id, 0);
2350 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 2390 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2351 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 2391 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2352 io_apic_write(dev->id, 0, reg_00.raw); 2392 io_apic_write(dev->id, 0, reg_00.raw);
2353 } 2393 }
2354 spin_unlock_irqrestore(&ioapic_lock, flags); 2394 spin_unlock_irqrestore(&ioapic_lock, flags);
2355 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) 2395 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2356 ioapic_write_entry(dev->id, i, entry[i]); 2396 ioapic_write_entry(dev->id, i, entry[i]);
2357 2397
2358 return 0; 2398 return 0;
@@ -2366,24 +2406,23 @@ static struct sysdev_class ioapic_sysdev_class = {
2366 2406
2367static int __init ioapic_init_sysfs(void) 2407static int __init ioapic_init_sysfs(void)
2368{ 2408{
2369 struct sys_device * dev; 2409 struct sys_device *dev;
2370 int i, size, error = 0; 2410 int i, size, error = 0;
2371 2411
2372 error = sysdev_class_register(&ioapic_sysdev_class); 2412 error = sysdev_class_register(&ioapic_sysdev_class);
2373 if (error) 2413 if (error)
2374 return error; 2414 return error;
2375 2415
2376 for (i = 0; i < nr_ioapics; i++ ) { 2416 for (i = 0; i < nr_ioapics; i++) {
2377 size = sizeof(struct sys_device) + nr_ioapic_registers[i] 2417 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2378 * sizeof(struct IO_APIC_route_entry); 2418 * sizeof(struct IO_APIC_route_entry);
2379 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); 2419 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2380 if (!mp_ioapic_data[i]) { 2420 if (!mp_ioapic_data[i]) {
2381 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); 2421 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2382 continue; 2422 continue;
2383 } 2423 }
2384 memset(mp_ioapic_data[i], 0, size);
2385 dev = &mp_ioapic_data[i]->dev; 2424 dev = &mp_ioapic_data[i]->dev;
2386 dev->id = i; 2425 dev->id = i;
2387 dev->cls = &ioapic_sysdev_class; 2426 dev->cls = &ioapic_sysdev_class;
2388 error = sysdev_register(dev); 2427 error = sysdev_register(dev);
2389 if (error) { 2428 if (error) {
@@ -2458,7 +2497,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2458 msg->address_lo = 2497 msg->address_lo =
2459 MSI_ADDR_BASE_LO | 2498 MSI_ADDR_BASE_LO |
2460 ((INT_DEST_MODE == 0) ? 2499 ((INT_DEST_MODE == 0) ?
2461 MSI_ADDR_DEST_MODE_PHYSICAL: 2500MSI_ADDR_DEST_MODE_PHYSICAL:
2462 MSI_ADDR_DEST_MODE_LOGICAL) | 2501 MSI_ADDR_DEST_MODE_LOGICAL) |
2463 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 2502 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2464 MSI_ADDR_REDIRECTION_CPU: 2503 MSI_ADDR_REDIRECTION_CPU:
@@ -2469,7 +2508,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2469 MSI_DATA_TRIGGER_EDGE | 2508 MSI_DATA_TRIGGER_EDGE |
2470 MSI_DATA_LEVEL_ASSERT | 2509 MSI_DATA_LEVEL_ASSERT |
2471 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 2510 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2472 MSI_DATA_DELIVERY_FIXED: 2511MSI_DATA_DELIVERY_FIXED:
2473 MSI_DATA_DELIVERY_LOWPRI) | 2512 MSI_DATA_DELIVERY_LOWPRI) |
2474 MSI_DATA_VECTOR(vector); 2513 MSI_DATA_VECTOR(vector);
2475 } 2514 }
@@ -2640,12 +2679,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2640#endif /* CONFIG_HT_IRQ */ 2679#endif /* CONFIG_HT_IRQ */
2641 2680
2642/* -------------------------------------------------------------------------- 2681/* --------------------------------------------------------------------------
2643 ACPI-based IOAPIC Configuration 2682 ACPI-based IOAPIC Configuration
2644 -------------------------------------------------------------------------- */ 2683 -------------------------------------------------------------------------- */
2645 2684
2646#ifdef CONFIG_ACPI 2685#ifdef CONFIG_ACPI
2647 2686
2648int __init io_apic_get_unique_id (int ioapic, int apic_id) 2687int __init io_apic_get_unique_id(int ioapic, int apic_id)
2649{ 2688{
2650 union IO_APIC_reg_00 reg_00; 2689 union IO_APIC_reg_00 reg_00;
2651 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 2690 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2654,10 +2693,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2654 int i = 0; 2693 int i = 0;
2655 2694
2656 /* 2695 /*
2657 * The P4 platform supports up to 256 APIC IDs on two separate APIC 2696 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2658 * buses (one for LAPICs, one for IOAPICs), where predecessors only 2697 * buses (one for LAPICs, one for IOAPICs), where predecessors only
2659 * supports up to 16 on one shared APIC bus. 2698 * supports up to 16 on one shared APIC bus.
2660 * 2699 *
2661 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full 2700 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2662 * advantage of new APIC bus architecture. 2701 * advantage of new APIC bus architecture.
2663 */ 2702 */
@@ -2676,7 +2715,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2676 } 2715 }
2677 2716
2678 /* 2717 /*
2679 * Every APIC in a system must have a unique ID or we get lots of nice 2718 * Every APIC in a system must have a unique ID or we get lots of nice
2680 * 'stuck on smp_invalidate_needed IPI wait' messages. 2719 * 'stuck on smp_invalidate_needed IPI wait' messages.
2681 */ 2720 */
2682 if (check_apicid_used(apic_id_map, apic_id)) { 2721 if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2693,7 +2732,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2693 "trying %d\n", ioapic, apic_id, i); 2732 "trying %d\n", ioapic, apic_id, i);
2694 2733
2695 apic_id = i; 2734 apic_id = i;
2696 } 2735 }
2697 2736
2698 tmp = apicid_to_cpu_present(apic_id); 2737 tmp = apicid_to_cpu_present(apic_id);
2699 physids_or(apic_id_map, apic_id_map, tmp); 2738 physids_or(apic_id_map, apic_id_map, tmp);
@@ -2720,7 +2759,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2720} 2759}
2721 2760
2722 2761
2723int __init io_apic_get_version (int ioapic) 2762int __init io_apic_get_version(int ioapic)
2724{ 2763{
2725 union IO_APIC_reg_01 reg_01; 2764 union IO_APIC_reg_01 reg_01;
2726 unsigned long flags; 2765 unsigned long flags;
@@ -2733,7 +2772,7 @@ int __init io_apic_get_version (int ioapic)
2733} 2772}
2734 2773
2735 2774
2736int __init io_apic_get_redir_entries (int ioapic) 2775int __init io_apic_get_redir_entries(int ioapic)
2737{ 2776{
2738 union IO_APIC_reg_01 reg_01; 2777 union IO_APIC_reg_01 reg_01;
2739 unsigned long flags; 2778 unsigned long flags;
@@ -2746,7 +2785,7 @@ int __init io_apic_get_redir_entries (int ioapic)
2746} 2785}
2747 2786
2748 2787
2749int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) 2788int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2750{ 2789{
2751 struct IO_APIC_route_entry entry; 2790 struct IO_APIC_route_entry entry;
2752 2791
@@ -2762,7 +2801,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2762 * corresponding device driver registers for this IRQ. 2801 * corresponding device driver registers for this IRQ.
2763 */ 2802 */
2764 2803
2765 memset(&entry,0,sizeof(entry)); 2804 memset(&entry, 0, sizeof(entry));
2766 2805
2767 entry.delivery_mode = INT_DELIVERY_MODE; 2806 entry.delivery_mode = INT_DELIVERY_MODE;
2768 entry.dest_mode = INT_DEST_MODE; 2807 entry.dest_mode = INT_DEST_MODE;
@@ -2781,7 +2820,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2781 2820
2782 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " 2821 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2783 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, 2822 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2784 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, 2823 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2785 edge_level, active_high_low); 2824 edge_level, active_high_low);
2786 2825
2787 ioapic_register_intr(irq, entry.vector, edge_level); 2826 ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2841,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2802 return -1; 2841 return -1;
2803 2842
2804 for (i = 0; i < mp_irq_entries; i++) 2843 for (i = 0; i < mp_irq_entries; i++)
2805 if (mp_irqs[i].mpc_irqtype == mp_INT && 2844 if (mp_irqs[i].mp_irqtype == mp_INT &&
2806 mp_irqs[i].mpc_srcbusirq == bus_irq) 2845 mp_irqs[i].mp_srcbusirq == bus_irq)
2807 break; 2846 break;
2808 if (i >= mp_irq_entries) 2847 if (i >= mp_irq_entries)
2809 return -1; 2848 return -1;
@@ -2836,3 +2875,34 @@ static int __init parse_noapic(char *arg)
2836 return 0; 2875 return 0;
2837} 2876}
2838early_param("noapic", parse_noapic); 2877early_param("noapic", parse_noapic);
2878
2879void __init ioapic_init_mappings(void)
2880{
2881 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2882 int i;
2883
2884 for (i = 0; i < nr_ioapics; i++) {
2885 if (smp_found_config) {
2886 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2887 if (!ioapic_phys) {
2888 printk(KERN_ERR
2889 "WARNING: bogus zero IO-APIC "
2890 "address found in MPTABLE, "
2891 "disabling IO/APIC support!\n");
2892 smp_found_config = 0;
2893 skip_ioapic_setup = 1;
2894 goto fake_ioapic_page;
2895 }
2896 } else {
2897fake_ioapic_page:
2898 ioapic_phys = (unsigned long)
2899 alloc_bootmem_pages(PAGE_SIZE);
2900 ioapic_phys = __pa(ioapic_phys);
2901 }
2902 set_fixmap_nocache(idx, ioapic_phys);
2903 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2904 __fix_to_virt(idx), ioapic_phys);
2905 idx++;
2906 }
2907}
2908
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..02063ae042f7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -37,6 +37,7 @@
37#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
38#endif 38#endif
39#include <linux/bootmem.h> 39#include <linux/bootmem.h>
40#include <linux/dmar.h>
40 41
41#include <asm/idle.h> 42#include <asm/idle.h>
42#include <asm/io.h> 43#include <asm/io.h>
@@ -45,13 +46,17 @@
45#include <asm/proto.h> 46#include <asm/proto.h>
46#include <asm/acpi.h> 47#include <asm/acpi.h>
47#include <asm/dma.h> 48#include <asm/dma.h>
49#include <asm/i8259.h>
48#include <asm/nmi.h> 50#include <asm/nmi.h>
49#include <asm/msidef.h> 51#include <asm/msidef.h>
50#include <asm/hypertransport.h> 52#include <asm/hypertransport.h>
53#include <asm/irq_remapping.h>
51 54
52#include <mach_ipi.h> 55#include <mach_ipi.h>
53#include <mach_apic.h> 56#include <mach_apic.h>
54 57
58#define __apicdebuginit(type) static type __init
59
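The __apicdebuginit(type) helper simply folds the storage class and init-section annotation into the return-type position, which is what the later dump routines rely on:

    __apicdebuginit(void) print_PIC(void);
    /* expands to: */
    static void __init print_PIC(void);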
55struct irq_cfg { 60struct irq_cfg {
56 cpumask_t domain; 61 cpumask_t domain;
57 cpumask_t old_domain; 62 cpumask_t old_domain;
@@ -61,7 +66,7 @@ struct irq_cfg {
61}; 66};
62 67
63/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 68/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
64struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { 69static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
65 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 70 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
66 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 71 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
67 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 72 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -82,7 +87,9 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
82 87
83static int assign_irq_vector(int irq, cpumask_t mask); 88static int assign_irq_vector(int irq, cpumask_t mask);
84 89
85#define __apicdebuginit __init 90int first_system_vector = 0xfe;
91
92char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
86 93
87int sis_apic_bug; /* not actually supported, dummy for compile */ 94int sis_apic_bug; /* not actually supported, dummy for compile */
88 95
@@ -90,29 +97,34 @@ static int no_timer_check;
90 97
91static int disable_timer_pin_1 __initdata; 98static int disable_timer_pin_1 __initdata;
92 99
93int timer_over_8254 __initdata = 1; 100int timer_through_8259 __initdata;
94 101
95/* Where if anywhere is the i8259 connect in external int mode */ 102/* Where if anywhere is the i8259 connect in external int mode */
96static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 103static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
97 104
98static DEFINE_SPINLOCK(ioapic_lock); 105static DEFINE_SPINLOCK(ioapic_lock);
99DEFINE_SPINLOCK(vector_lock); 106static DEFINE_SPINLOCK(vector_lock);
100 107
101/* 108/*
102 * # of IRQ routing registers 109 * # of IRQ routing registers
103 */ 110 */
104int nr_ioapic_registers[MAX_IO_APICS]; 111int nr_ioapic_registers[MAX_IO_APICS];
105 112
113/* I/O APIC RTE contents at the OS boot up */
114struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
115
106/* I/O APIC entries */ 116/* I/O APIC entries */
107struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 117struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
108int nr_ioapics; 118int nr_ioapics;
109 119
110/* MP IRQ source entries */ 120/* MP IRQ source entries */
111struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 121struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
112 122
113/* # of MP IRQ source entries */ 123/* # of MP IRQ source entries */
114int mp_irq_entries; 124int mp_irq_entries;
115 125
126DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
127
116/* 128/*
117 * Rough estimation of how many shared IRQs there are, can 129 * Rough estimation of how many shared IRQs there are, can
118 * be changed anytime. 130 * be changed anytime.
@@ -140,7 +152,7 @@ struct io_apic {
140static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 152static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
141{ 153{
142 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 154 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
143 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 155 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
144} 156}
145 157
146static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 158static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -183,7 +195,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
183 break; 195 break;
184 reg = io_apic_read(entry->apic, 0x10 + pin*2); 196 reg = io_apic_read(entry->apic, 0x10 + pin*2);
185 /* Is the remote IRR bit set? */ 197 /* Is the remote IRR bit set? */
186 if ((reg >> 14) & 1) { 198 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
187 spin_unlock_irqrestore(&ioapic_lock, flags); 199 spin_unlock_irqrestore(&ioapic_lock, flags);
188 return true; 200 return true;
189 } 201 }
@@ -296,9 +308,14 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
296 pin = entry->pin; 308 pin = entry->pin;
297 if (pin == -1) 309 if (pin == -1)
298 break; 310 break;
299 io_apic_write(apic, 0x11 + pin*2, dest); 311 /*
312 * With interrupt-remapping, destination information comes
313 * from interrupt-remapping table entry.
314 */
315 if (!irq_remapped(irq))
316 io_apic_write(apic, 0x11 + pin*2, dest);
300 reg = io_apic_read(apic, 0x10 + pin*2); 317 reg = io_apic_read(apic, 0x10 + pin*2);
301 reg &= ~0x000000ff; 318 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
302 reg |= vector; 319 reg |= vector;
303 io_apic_modify(apic, reg); 320 io_apic_modify(apic, reg);
304 if (!entry->next) 321 if (!entry->next)
@@ -360,16 +377,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
360 entry->pin = pin; 377 entry->pin = pin;
361} 378}
362 379
380/*
381 * Reroute an IRQ to a different pin.
382 */
383static void __init replace_pin_at_irq(unsigned int irq,
384 int oldapic, int oldpin,
385 int newapic, int newpin)
386{
387 struct irq_pin_list *entry = irq_2_pin + irq;
388
389 while (1) {
390 if (entry->apic == oldapic && entry->pin == oldpin) {
391 entry->apic = newapic;
392 entry->pin = newpin;
393 }
394 if (!entry->next)
395 break;
396 entry = irq_2_pin + entry->next;
397 }
398}
399
363 400
364#define DO_ACTION(name,R,ACTION, FINAL) \ 401#define DO_ACTION(name,R,ACTION, FINAL) \
365 \ 402 \
366 static void name##_IO_APIC_irq (unsigned int irq) \ 403 static void name##_IO_APIC_irq (unsigned int irq) \
367 __DO_ACTION(R, ACTION, FINAL) 404 __DO_ACTION(R, ACTION, FINAL)
368 405
369DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) 406/* mask = 1 */
370 /* mask = 1 */ 407DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
371DO_ACTION( __unmask, 0, &= 0xfffeffff, ) 408
372 /* mask = 0 */ 409/* mask = 0 */
410DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
373 411
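As a rough guide (the __DO_ACTION body is outside this hunk, so the loop shape below is an assumption), DO_ACTION(__mask, ...) generates a walker over the irq_2_pin chain along these lines:

    /* Approximate expansion of
     * DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic));
     * only the register edit (ACTION) and the FINAL step come from the caller.
     */
    static void __mask_IO_APIC_irq(unsigned int irq)
    {
    	struct irq_pin_list *entry = irq_2_pin + irq;

    	for (;;) {
    		unsigned int reg;
    		int pin = entry->pin;

    		if (pin == -1)
    			break;
    		reg = io_apic_read(entry->apic, 0x10 + pin * 2);
    		reg |= IO_APIC_REDIR_MASKED;		/* ACTION */
    		io_apic_modify(entry->apic, reg);
    		io_apic_sync(entry->apic);		/* FINAL */
    		if (!entry->next)
    			break;
    		entry = irq_2_pin + entry->next;
    	}
    }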
374static void mask_IO_APIC_irq (unsigned int irq) 412static void mask_IO_APIC_irq (unsigned int irq)
375{ 413{
@@ -412,6 +450,69 @@ static void clear_IO_APIC (void)
412 clear_IO_APIC_pin(apic, pin); 450 clear_IO_APIC_pin(apic, pin);
413} 451}
414 452
453/*
454 * Saves and masks all the unmasked IO-APIC RTE's
455 */
456int save_mask_IO_APIC_setup(void)
457{
458 union IO_APIC_reg_01 reg_01;
459 unsigned long flags;
460 int apic, pin;
461
462 /*
463 * The number of IO-APIC IRQ registers (== #pins):
464 */
465 for (apic = 0; apic < nr_ioapics; apic++) {
466 spin_lock_irqsave(&ioapic_lock, flags);
467 reg_01.raw = io_apic_read(apic, 1);
468 spin_unlock_irqrestore(&ioapic_lock, flags);
469 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
470 }
471
472 for (apic = 0; apic < nr_ioapics; apic++) {
473 early_ioapic_entries[apic] =
474 kzalloc(sizeof(struct IO_APIC_route_entry) *
475 nr_ioapic_registers[apic], GFP_KERNEL);
476 if (!early_ioapic_entries[apic])
477 return -ENOMEM;
478 }
479
480 for (apic = 0; apic < nr_ioapics; apic++)
481 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
482 struct IO_APIC_route_entry entry;
483
484 entry = early_ioapic_entries[apic][pin] =
485 ioapic_read_entry(apic, pin);
486 if (!entry.mask) {
487 entry.mask = 1;
488 ioapic_write_entry(apic, pin, entry);
489 }
490 }
491 return 0;
492}
493
494void restore_IO_APIC_setup(void)
495{
496 int apic, pin;
497
498 for (apic = 0; apic < nr_ioapics; apic++)
499 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
500 ioapic_write_entry(apic, pin,
501 early_ioapic_entries[apic][pin]);
502}
503
504void reinit_intr_remapped_IO_APIC(int intr_remapping)
505{
506 /*
507 * for now plain restore of previous settings.
508 * TBD: In the case of OS enabling interrupt-remapping,
509 * IO-APIC RTE's need to be setup to point to interrupt-remapping
510 * table entries. for now, do a plain restore, and wait for
511 * the setup_IO_APIC_irqs() to do proper initialization.
512 */
513 restore_IO_APIC_setup();
514}
515
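The save/mask and restore helpers are meant to bracket the attempt to switch into interrupt-remapping mode; a sketch of the expected caller, with enable_intr_remapping() standing in for the remapping entry point (an assumption, not part of this hunk):

    /* Sketch only: caller side of the save/mask + restore pair. */
    if (save_mask_IO_APIC_setup()) {
    	printk(KERN_INFO "Saving IO-APIC state failed\n");	/* allocation failure */
    	return;
    }
    if (enable_intr_remapping(1)) {		/* assumed entry point */
    	restore_IO_APIC_setup();	/* remapping failed: put RTEs back as saved */
    	return;
    }
    reinit_intr_remapped_IO_APIC(1);	/* remapping on: reprogram the RTEs later */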
415int skip_ioapic_setup; 516int skip_ioapic_setup;
416int ioapic_force; 517int ioapic_force;
417 518
@@ -430,20 +531,6 @@ static int __init disable_timer_pin_setup(char *arg)
430} 531}
431__setup("disable_timer_pin_1", disable_timer_pin_setup); 532__setup("disable_timer_pin_1", disable_timer_pin_setup);
432 533
433static int __init setup_disable_8254_timer(char *s)
434{
435 timer_over_8254 = -1;
436 return 1;
437}
438static int __init setup_enable_8254_timer(char *s)
439{
440 timer_over_8254 = 2;
441 return 1;
442}
443
444__setup("disable_8254_timer", setup_disable_8254_timer);
445__setup("enable_8254_timer", setup_enable_8254_timer);
446
447 534
448/* 535/*
449 * Find the IRQ entry number of a certain pin. 536 * Find the IRQ entry number of a certain pin.
@@ -453,10 +540,10 @@ static int find_irq_entry(int apic, int pin, int type)
453 int i; 540 int i;
454 541
455 for (i = 0; i < mp_irq_entries; i++) 542 for (i = 0; i < mp_irq_entries; i++)
456 if (mp_irqs[i].mpc_irqtype == type && 543 if (mp_irqs[i].mp_irqtype == type &&
457 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 544 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
458 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 545 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
459 mp_irqs[i].mpc_dstirq == pin) 546 mp_irqs[i].mp_dstirq == pin)
460 return i; 547 return i;
461 548
462 return -1; 549 return -1;
@@ -470,13 +557,13 @@ static int __init find_isa_irq_pin(int irq, int type)
470 int i; 557 int i;
471 558
472 for (i = 0; i < mp_irq_entries; i++) { 559 for (i = 0; i < mp_irq_entries; i++) {
473 int lbus = mp_irqs[i].mpc_srcbus; 560 int lbus = mp_irqs[i].mp_srcbus;
474 561
475 if (test_bit(lbus, mp_bus_not_pci) && 562 if (test_bit(lbus, mp_bus_not_pci) &&
476 (mp_irqs[i].mpc_irqtype == type) && 563 (mp_irqs[i].mp_irqtype == type) &&
477 (mp_irqs[i].mpc_srcbusirq == irq)) 564 (mp_irqs[i].mp_srcbusirq == irq))
478 565
479 return mp_irqs[i].mpc_dstirq; 566 return mp_irqs[i].mp_dstirq;
480 } 567 }
481 return -1; 568 return -1;
482} 569}
@@ -486,17 +573,17 @@ static int __init find_isa_irq_apic(int irq, int type)
486 int i; 573 int i;
487 574
488 for (i = 0; i < mp_irq_entries; i++) { 575 for (i = 0; i < mp_irq_entries; i++) {
489 int lbus = mp_irqs[i].mpc_srcbus; 576 int lbus = mp_irqs[i].mp_srcbus;
490 577
491 if (test_bit(lbus, mp_bus_not_pci) && 578 if (test_bit(lbus, mp_bus_not_pci) &&
492 (mp_irqs[i].mpc_irqtype == type) && 579 (mp_irqs[i].mp_irqtype == type) &&
493 (mp_irqs[i].mpc_srcbusirq == irq)) 580 (mp_irqs[i].mp_srcbusirq == irq))
494 break; 581 break;
495 } 582 }
496 if (i < mp_irq_entries) { 583 if (i < mp_irq_entries) {
497 int apic; 584 int apic;
498 for(apic = 0; apic < nr_ioapics; apic++) { 585 for(apic = 0; apic < nr_ioapics; apic++) {
499 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 586 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
500 return apic; 587 return apic;
501 } 588 }
502 } 589 }
@@ -516,28 +603,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
516 603
517 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", 604 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
518 bus, slot, pin); 605 bus, slot, pin);
519 if (mp_bus_id_to_pci_bus[bus] == -1) { 606 if (test_bit(bus, mp_bus_not_pci)) {
520 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 607 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
521 return -1; 608 return -1;
522 } 609 }
523 for (i = 0; i < mp_irq_entries; i++) { 610 for (i = 0; i < mp_irq_entries; i++) {
524 int lbus = mp_irqs[i].mpc_srcbus; 611 int lbus = mp_irqs[i].mp_srcbus;
525 612
526 for (apic = 0; apic < nr_ioapics; apic++) 613 for (apic = 0; apic < nr_ioapics; apic++)
527 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 614 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
528 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 615 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
529 break; 616 break;
530 617
531 if (!test_bit(lbus, mp_bus_not_pci) && 618 if (!test_bit(lbus, mp_bus_not_pci) &&
532 !mp_irqs[i].mpc_irqtype && 619 !mp_irqs[i].mp_irqtype &&
533 (bus == lbus) && 620 (bus == lbus) &&
534 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 621 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
535 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 622 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
536 623
537 if (!(apic || IO_APIC_IRQ(irq))) 624 if (!(apic || IO_APIC_IRQ(irq)))
538 continue; 625 continue;
539 626
540 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 627 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
541 return irq; 628 return irq;
542 /* 629 /*
543 * Use the first all-but-pin matching entry as a 630 * Use the first all-but-pin matching entry as a
@@ -565,13 +652,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
565 652
566static int MPBIOS_polarity(int idx) 653static int MPBIOS_polarity(int idx)
567{ 654{
568 int bus = mp_irqs[idx].mpc_srcbus; 655 int bus = mp_irqs[idx].mp_srcbus;
569 int polarity; 656 int polarity;
570 657
571 /* 658 /*
572 * Determine IRQ line polarity (high active or low active): 659 * Determine IRQ line polarity (high active or low active):
573 */ 660 */
574 switch (mp_irqs[idx].mpc_irqflag & 3) 661 switch (mp_irqs[idx].mp_irqflag & 3)
575 { 662 {
576 case 0: /* conforms, ie. bus-type dependent polarity */ 663 case 0: /* conforms, ie. bus-type dependent polarity */
577 if (test_bit(bus, mp_bus_not_pci)) 664 if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +694,13 @@ static int MPBIOS_polarity(int idx)
607 694
608static int MPBIOS_trigger(int idx) 695static int MPBIOS_trigger(int idx)
609{ 696{
610 int bus = mp_irqs[idx].mpc_srcbus; 697 int bus = mp_irqs[idx].mp_srcbus;
611 int trigger; 698 int trigger;
612 699
613 /* 700 /*
614 * Determine IRQ trigger mode (edge or level sensitive): 701 * Determine IRQ trigger mode (edge or level sensitive):
615 */ 702 */
616 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 703 switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
617 { 704 {
618 case 0: /* conforms, ie. bus-type dependent */ 705 case 0: /* conforms, ie. bus-type dependent */
619 if (test_bit(bus, mp_bus_not_pci)) 706 if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +747,16 @@ static inline int irq_trigger(int idx)
660static int pin_2_irq(int idx, int apic, int pin) 747static int pin_2_irq(int idx, int apic, int pin)
661{ 748{
662 int irq, i; 749 int irq, i;
663 int bus = mp_irqs[idx].mpc_srcbus; 750 int bus = mp_irqs[idx].mp_srcbus;
664 751
665 /* 752 /*
666 * Debugging check, we are in big trouble if this message pops up! 753 * Debugging check, we are in big trouble if this message pops up!
667 */ 754 */
668 if (mp_irqs[idx].mpc_dstirq != pin) 755 if (mp_irqs[idx].mp_dstirq != pin)
669 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 756 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
670 757
671 if (test_bit(bus, mp_bus_not_pci)) { 758 if (test_bit(bus, mp_bus_not_pci)) {
672 irq = mp_irqs[idx].mpc_srcbusirq; 759 irq = mp_irqs[idx].mp_srcbusirq;
673 } else { 760 } else {
674 /* 761 /*
675 * PCI IRQs are mapped in order 762 * PCI IRQs are mapped in order
@@ -683,6 +770,19 @@ static int pin_2_irq(int idx, int apic, int pin)
683 return irq; 770 return irq;
684} 771}
685 772
773void lock_vector_lock(void)
774{
 775 /* Used to ensure that the online set of cpus does not change
776 * during assign_irq_vector.
777 */
778 spin_lock(&vector_lock);
779}
780
781void unlock_vector_lock(void)
782{
783 spin_unlock(&vector_lock);
784}
785
686static int __assign_irq_vector(int irq, cpumask_t mask) 786static int __assign_irq_vector(int irq, cpumask_t mask)
687{ 787{
688 /* 788 /*
@@ -718,7 +818,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
718 return 0; 818 return 0;
719 } 819 }
720 820
721 for_each_cpu_mask(cpu, mask) { 821 for_each_cpu_mask_nr(cpu, mask) {
722 cpumask_t domain, new_mask; 822 cpumask_t domain, new_mask;
723 int new_cpu; 823 int new_cpu;
724 int vector, offset; 824 int vector, offset;
@@ -730,7 +830,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
730 offset = current_offset; 830 offset = current_offset;
731next: 831next:
732 vector += 8; 832 vector += 8;
733 if (vector >= FIRST_SYSTEM_VECTOR) { 833 if (vector >= first_system_vector) {
734 /* If we run out of vectors on large boxen, must share them. */ 834 /* If we run out of vectors on large boxen, must share them. */
735 offset = (offset + 1) % 8; 835 offset = (offset + 1) % 8;
736 vector = FIRST_DEVICE_VECTOR + offset; 836 vector = FIRST_DEVICE_VECTOR + offset;
@@ -739,7 +839,7 @@ next:
739 continue; 839 continue;
740 if (vector == IA32_SYSCALL_VECTOR) 840 if (vector == IA32_SYSCALL_VECTOR)
741 goto next; 841 goto next;
742 for_each_cpu_mask(new_cpu, new_mask) 842 for_each_cpu_mask_nr(new_cpu, new_mask)
743 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 843 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
744 goto next; 844 goto next;
745 /* Found one! */ 845 /* Found one! */
@@ -749,7 +849,7 @@ next:
749 cfg->move_in_progress = 1; 849 cfg->move_in_progress = 1;
750 cfg->old_domain = cfg->domain; 850 cfg->old_domain = cfg->domain;
751 } 851 }
752 for_each_cpu_mask(new_cpu, new_mask) 852 for_each_cpu_mask_nr(new_cpu, new_mask)
753 per_cpu(vector_irq, new_cpu)[vector] = irq; 853 per_cpu(vector_irq, new_cpu)[vector] = irq;
754 cfg->vector = vector; 854 cfg->vector = vector;
755 cfg->domain = domain; 855 cfg->domain = domain;
@@ -781,7 +881,7 @@ static void __clear_irq_vector(int irq)
781 881
782 vector = cfg->vector; 882 vector = cfg->vector;
783 cpus_and(mask, cfg->domain, cpu_online_map); 883 cpus_and(mask, cfg->domain, cpu_online_map);
784 for_each_cpu_mask(cpu, mask) 884 for_each_cpu_mask_nr(cpu, mask)
785 per_cpu(vector_irq, cpu)[vector] = -1; 885 per_cpu(vector_irq, cpu)[vector] = -1;
786 886
787 cfg->vector = 0; 887 cfg->vector = 0;
@@ -811,20 +911,99 @@ void __setup_vector_irq(int cpu)
811 } 911 }
812} 912}
813 913
814
815static struct irq_chip ioapic_chip; 914static struct irq_chip ioapic_chip;
915#ifdef CONFIG_INTR_REMAP
916static struct irq_chip ir_ioapic_chip;
917#endif
816 918
817static void ioapic_register_intr(int irq, unsigned long trigger) 919static void ioapic_register_intr(int irq, unsigned long trigger)
818{ 920{
819 if (trigger) { 921 if (trigger)
820 irq_desc[irq].status |= IRQ_LEVEL; 922 irq_desc[irq].status |= IRQ_LEVEL;
821 set_irq_chip_and_handler_name(irq, &ioapic_chip, 923 else
822 handle_fasteoi_irq, "fasteoi");
823 } else {
824 irq_desc[irq].status &= ~IRQ_LEVEL; 924 irq_desc[irq].status &= ~IRQ_LEVEL;
925
926#ifdef CONFIG_INTR_REMAP
927 if (irq_remapped(irq)) {
928 irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
929 if (trigger)
930 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
931 handle_fasteoi_irq,
932 "fasteoi");
933 else
934 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
935 handle_edge_irq, "edge");
936 return;
937 }
938#endif
939 if (trigger)
940 set_irq_chip_and_handler_name(irq, &ioapic_chip,
941 handle_fasteoi_irq,
942 "fasteoi");
943 else
825 set_irq_chip_and_handler_name(irq, &ioapic_chip, 944 set_irq_chip_and_handler_name(irq, &ioapic_chip,
826 handle_edge_irq, "edge"); 945 handle_edge_irq, "edge");
946}
947
948static int setup_ioapic_entry(int apic, int irq,
949 struct IO_APIC_route_entry *entry,
950 unsigned int destination, int trigger,
951 int polarity, int vector)
952{
953 /*
954 * add it to the IO-APIC irq-routing table:
955 */
956 memset(entry,0,sizeof(*entry));
957
958#ifdef CONFIG_INTR_REMAP
959 if (intr_remapping_enabled) {
960 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
961 struct irte irte;
962 struct IR_IO_APIC_route_entry *ir_entry =
963 (struct IR_IO_APIC_route_entry *) entry;
964 int index;
965
966 if (!iommu)
967 panic("No mapping iommu for ioapic %d\n", apic);
968
969 index = alloc_irte(iommu, irq, 1);
970 if (index < 0)
971 panic("Failed to allocate IRTE for ioapic %d\n", apic);
972
973 memset(&irte, 0, sizeof(irte));
974
975 irte.present = 1;
976 irte.dst_mode = INT_DEST_MODE;
977 irte.trigger_mode = trigger;
978 irte.dlvry_mode = INT_DELIVERY_MODE;
979 irte.vector = vector;
980 irte.dest_id = IRTE_DEST(destination);
981
982 modify_irte(irq, &irte);
983
984 ir_entry->index2 = (index >> 15) & 0x1;
985 ir_entry->zero = 0;
986 ir_entry->format = 1;
987 ir_entry->index = (index & 0x7fff);
988 } else
989#endif
990 {
991 entry->delivery_mode = INT_DELIVERY_MODE;
992 entry->dest_mode = INT_DEST_MODE;
993 entry->dest = destination;
827 } 994 }
995
996 entry->mask = 0; /* enable IRQ */
997 entry->trigger = trigger;
998 entry->polarity = polarity;
999 entry->vector = vector;
1000
1001 /* Mask level triggered irqs.
1002 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1003 */
1004 if (trigger)
1005 entry->mask = 1;
1006 return 0;
828} 1007}
829 1008
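The IR-format RTE splits the 16-bit IRTE handle across two fields: bit 15 goes into index2 and the low 15 bits into index. A quick worked example:

    /* For an allocated handle of, say, index == 0x8003:          */
    /*   ir_entry->index2 = (0x8003 >> 15) & 0x1;   ->  1         */
    /*   ir_entry->index  =  0x8003 & 0x7fff;       ->  0x0003    */
    /* The hardware reassembles (index2 << 15) | index == 0x8003. */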
830static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1009static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
@@ -846,27 +1025,18 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
846 apic_printk(APIC_VERBOSE,KERN_DEBUG 1025 apic_printk(APIC_VERBOSE,KERN_DEBUG
847 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1026 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
848 "IRQ %d Mode:%i Active:%i)\n", 1027 "IRQ %d Mode:%i Active:%i)\n",
849 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, 1028 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
850 irq, trigger, polarity); 1029 irq, trigger, polarity);
851 1030
852 /*
853 * add it to the IO-APIC irq-routing table:
854 */
855 memset(&entry,0,sizeof(entry));
856 1031
857 entry.delivery_mode = INT_DELIVERY_MODE; 1032 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
858 entry.dest_mode = INT_DEST_MODE; 1033 cpu_mask_to_apicid(mask), trigger, polarity,
859 entry.dest = cpu_mask_to_apicid(mask); 1034 cfg->vector)) {
860 entry.mask = 0; /* enable IRQ */ 1035 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
861 entry.trigger = trigger; 1036 mp_ioapics[apic].mp_apicid, pin);
862 entry.polarity = polarity; 1037 __clear_irq_vector(irq);
863 entry.vector = cfg->vector; 1038 return;
864 1039 }
865 /* Mask level triggered irqs.
866 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
867 */
868 if (trigger)
869 entry.mask = 1;
870 1040
871 ioapic_register_intr(irq, trigger); 1041 ioapic_register_intr(irq, trigger);
872 if (irq < 16) 1042 if (irq < 16)
@@ -887,10 +1057,10 @@ static void __init setup_IO_APIC_irqs(void)
887 idx = find_irq_entry(apic,pin,mp_INT); 1057 idx = find_irq_entry(apic,pin,mp_INT);
888 if (idx == -1) { 1058 if (idx == -1) {
889 if (first_notcon) { 1059 if (first_notcon) {
890 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); 1060 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
891 first_notcon = 0; 1061 first_notcon = 0;
892 } else 1062 } else
893 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); 1063 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
894 continue; 1064 continue;
895 } 1065 }
896 if (!first_notcon) { 1066 if (!first_notcon) {
@@ -911,26 +1081,24 @@ static void __init setup_IO_APIC_irqs(void)
911} 1081}
912 1082
913/* 1083/*
914 * Set up the 8259A-master output pin as broadcast to all 1084 * Set up the timer pin, possibly with the 8259A-master behind.
915 * CPUs.
916 */ 1085 */
917static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) 1086static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1087 int vector)
918{ 1088{
919 struct IO_APIC_route_entry entry; 1089 struct IO_APIC_route_entry entry;
920 1090
921 memset(&entry, 0, sizeof(entry)); 1091 if (intr_remapping_enabled)
922 1092 return;
923 disable_8259A_irq(0);
924 1093
925 /* mask LVT0 */ 1094 memset(&entry, 0, sizeof(entry));
926 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
927 1095
928 /* 1096 /*
929 * We use logical delivery to get the timer IRQ 1097 * We use logical delivery to get the timer IRQ
930 * to the first CPU. 1098 * to the first CPU.
931 */ 1099 */
932 entry.dest_mode = INT_DEST_MODE; 1100 entry.dest_mode = INT_DEST_MODE;
933 entry.mask = 0; /* unmask IRQ now */ 1101 entry.mask = 1; /* mask IRQ now */
934 entry.dest = cpu_mask_to_apicid(TARGET_CPUS); 1102 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
935 entry.delivery_mode = INT_DELIVERY_MODE; 1103 entry.delivery_mode = INT_DELIVERY_MODE;
936 entry.polarity = 0; 1104 entry.polarity = 0;
@@ -939,7 +1107,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
939 1107
940 /* 1108 /*
941 * The timer IRQ doesn't have to know that behind the 1109 * The timer IRQ doesn't have to know that behind the
942 * scene we have a 8259A-master in AEOI mode ... 1110 * scene we may have a 8259A-master in AEOI mode ...
943 */ 1111 */
944 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1112 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
945 1113
@@ -947,11 +1115,10 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
947 * Add it to the IO-APIC irq-routing table: 1115 * Add it to the IO-APIC irq-routing table:
948 */ 1116 */
949 ioapic_write_entry(apic, pin, entry); 1117 ioapic_write_entry(apic, pin, entry);
950
951 enable_8259A_irq(0);
952} 1118}
953 1119
954void __apicdebuginit print_IO_APIC(void) 1120
1121__apicdebuginit(void) print_IO_APIC(void)
955{ 1122{
956 int apic, i; 1123 int apic, i;
957 union IO_APIC_reg_00 reg_00; 1124 union IO_APIC_reg_00 reg_00;
@@ -965,7 +1132,7 @@ void __apicdebuginit print_IO_APIC(void)
965 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1132 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
966 for (i = 0; i < nr_ioapics; i++) 1133 for (i = 0; i < nr_ioapics; i++)
967 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1134 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
968 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 1135 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
969 1136
970 /* 1137 /*
971 * We are a bit conservative about what we expect. We have to 1138 * We are a bit conservative about what we expect. We have to
@@ -983,7 +1150,7 @@ void __apicdebuginit print_IO_APIC(void)
983 spin_unlock_irqrestore(&ioapic_lock, flags); 1150 spin_unlock_irqrestore(&ioapic_lock, flags);
984 1151
985 printk("\n"); 1152 printk("\n");
986 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 1153 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
987 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1154 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
988 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1155 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
989 1156
@@ -1045,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void)
1045 return; 1212 return;
1046} 1213}
1047 1214
1048#if 0 1215__apicdebuginit(void) print_APIC_bitfield(int base)
1049
1050static __apicdebuginit void print_APIC_bitfield (int base)
1051{ 1216{
1052 unsigned int v; 1217 unsigned int v;
1053 int i, j; 1218 int i, j;
@@ -1068,16 +1233,18 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1068 } 1233 }
1069} 1234}
1070 1235
1071void __apicdebuginit print_local_APIC(void * dummy) 1236__apicdebuginit(void) print_local_APIC(void *dummy)
1072{ 1237{
1073 unsigned int v, ver, maxlvt; 1238 unsigned int v, ver, maxlvt;
1239 unsigned long icr;
1074 1240
1075 if (apic_verbosity == APIC_QUIET) 1241 if (apic_verbosity == APIC_QUIET)
1076 return; 1242 return;
1077 1243
1078 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1244 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1079 smp_processor_id(), hard_smp_processor_id()); 1245 smp_processor_id(), hard_smp_processor_id());
1080 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1246 v = apic_read(APIC_ID);
1247 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1081 v = apic_read(APIC_LVR); 1248 v = apic_read(APIC_LVR);
1082 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1249 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1083 ver = GET_APIC_VERSION(v); 1250 ver = GET_APIC_VERSION(v);
@@ -1113,10 +1280,9 @@ void __apicdebuginit print_local_APIC(void * dummy)
1113 v = apic_read(APIC_ESR); 1280 v = apic_read(APIC_ESR);
1114 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1115 1282
1116 v = apic_read(APIC_ICR); 1283 icr = apic_icr_read();
1117 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1118 v = apic_read(APIC_ICR2); 1285 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1119 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1120 1286
1121 v = apic_read(APIC_LVTT); 1287 v = apic_read(APIC_LVTT);
1122 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1288 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1144,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy)
1144 printk("\n"); 1310 printk("\n");
1145} 1311}
1146 1312
1147void print_all_local_APICs (void) 1313__apicdebuginit(void) print_all_local_APICs(void)
1148{ 1314{
1149 on_each_cpu(print_local_APIC, NULL, 1, 1); 1315 on_each_cpu(print_local_APIC, NULL, 1);
1150} 1316}
1151 1317
1152void __apicdebuginit print_PIC(void) 1318__apicdebuginit(void) print_PIC(void)
1153{ 1319{
1154 unsigned int v; 1320 unsigned int v;
1155 unsigned long flags; 1321 unsigned long flags;
@@ -1181,7 +1347,17 @@ void __apicdebuginit print_PIC(void)
1181 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1347 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1182} 1348}
1183 1349
1184#endif /* 0 */ 1350__apicdebuginit(int) print_all_ICs(void)
1351{
1352 print_PIC();
1353 print_all_local_APICs();
1354 print_IO_APIC();
1355
1356 return 0;
1357}
1358
1359fs_initcall(print_all_ICs);
1360
1185 1361
1186void __init enable_IO_APIC(void) 1362void __init enable_IO_APIC(void)
1187{ 1363{
@@ -1271,7 +1447,7 @@ void disable_IO_APIC(void)
1271 entry.dest_mode = 0; /* Physical */ 1447 entry.dest_mode = 0; /* Physical */
1272 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1448 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1273 entry.vector = 0; 1449 entry.vector = 0;
1274 entry.dest = GET_APIC_ID(read_apic_id()); 1450 entry.dest = read_apic_id();
1275 1451
1276 /* 1452 /*
1277 * Add it to the IO-APIC irq-routing table: 1453 * Add it to the IO-APIC irq-routing table:
@@ -1358,12 +1534,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1358static int ioapic_retrigger_irq(unsigned int irq) 1534static int ioapic_retrigger_irq(unsigned int irq)
1359{ 1535{
1360 struct irq_cfg *cfg = &irq_cfg[irq]; 1536 struct irq_cfg *cfg = &irq_cfg[irq];
1361 cpumask_t mask;
1362 unsigned long flags; 1537 unsigned long flags;
1363 1538
1364 spin_lock_irqsave(&vector_lock, flags); 1539 spin_lock_irqsave(&vector_lock, flags);
1365 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 1540 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1366 send_IPI_mask(mask, cfg->vector);
1367 spin_unlock_irqrestore(&vector_lock, flags); 1541 spin_unlock_irqrestore(&vector_lock, flags);
1368 1542
1369 return 1; 1543 return 1;
@@ -1379,6 +1553,147 @@ static int ioapic_retrigger_irq(unsigned int irq)
1379 */ 1553 */
1380 1554
1381#ifdef CONFIG_SMP 1555#ifdef CONFIG_SMP
1556
1557#ifdef CONFIG_INTR_REMAP
1558static void ir_irq_migration(struct work_struct *work);
1559
1560static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
1561
1562/*
1563 * Migrate the IO-APIC irq in the presence of intr-remapping.
1564 *
1565 * For edge triggered, irq migration is a simple atomic update(of vector
1566 * and cpu destination) of IRTE and flush the hardware cache.
1567 *
 1568 * For level triggered, we need to modify the io-apic RTE as well with the updated
 1569 * vector information, along with modifying the IRTE with vector and destination.
 1570 * So irq migration for level triggered is a little bit more complex compared to
1571 * edge triggered migration. But the good news is, we use the same algorithm
1572 * for level triggered migration as we have today, only difference being,
1573 * we now initiate the irq migration from process context instead of the
1574 * interrupt context.
1575 *
1576 * In future, when we do a directed EOI (combined with cpu EOI broadcast
1577 * suppression) to the IO-APIC, level triggered irq migration will also be
1578 * as simple as edge triggered migration and we can do the irq migration
1579 * with a simple atomic update to IO-APIC RTE.
1580 */
1581static void migrate_ioapic_irq(int irq, cpumask_t mask)
1582{
1583 struct irq_cfg *cfg = irq_cfg + irq;
1584 struct irq_desc *desc = irq_desc + irq;
1585 cpumask_t tmp, cleanup_mask;
1586 struct irte irte;
1587 int modify_ioapic_rte = desc->status & IRQ_LEVEL;
1588 unsigned int dest;
1589 unsigned long flags;
1590
1591 cpus_and(tmp, mask, cpu_online_map);
1592 if (cpus_empty(tmp))
1593 return;
1594
1595 if (get_irte(irq, &irte))
1596 return;
1597
1598 if (assign_irq_vector(irq, mask))
1599 return;
1600
1601 cpus_and(tmp, cfg->domain, mask);
1602 dest = cpu_mask_to_apicid(tmp);
1603
1604 if (modify_ioapic_rte) {
1605 spin_lock_irqsave(&ioapic_lock, flags);
1606 __target_IO_APIC_irq(irq, dest, cfg->vector);
1607 spin_unlock_irqrestore(&ioapic_lock, flags);
1608 }
1609
1610 irte.vector = cfg->vector;
1611 irte.dest_id = IRTE_DEST(dest);
1612
1613 /*
 1614 * Modify the IRTE and flush the interrupt entry cache.
1615 */
1616 modify_irte(irq, &irte);
1617
1618 if (cfg->move_in_progress) {
1619 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1620 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1621 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1622 cfg->move_in_progress = 0;
1623 }
1624
1625 irq_desc[irq].affinity = mask;
1626}
1627
1628static int migrate_irq_remapped_level(int irq)
1629{
1630 int ret = -1;
1631
1632 mask_IO_APIC_irq(irq);
1633
1634 if (io_apic_level_ack_pending(irq)) {
1635 /*
1636 * Interrupt in progress. Migrating irq now will change the
1637 * vector information in the IO-APIC RTE and that will confuse
1638 * the EOI broadcast performed by cpu.
1639 * So, delay the irq migration to the next instance.
1640 */
1641 schedule_delayed_work(&ir_migration_work, 1);
1642 goto unmask;
1643 }
1644
 1645 /* everything is clear. we have the right of way */
1646 migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
1647
1648 ret = 0;
1649 irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
1650 cpus_clear(irq_desc[irq].pending_mask);
1651
1652unmask:
1653 unmask_IO_APIC_irq(irq);
1654 return ret;
1655}
1656
1657static void ir_irq_migration(struct work_struct *work)
1658{
1659 int irq;
1660
1661 for (irq = 0; irq < NR_IRQS; irq++) {
1662 struct irq_desc *desc = irq_desc + irq;
1663 if (desc->status & IRQ_MOVE_PENDING) {
1664 unsigned long flags;
1665
1666 spin_lock_irqsave(&desc->lock, flags);
1667 if (!desc->chip->set_affinity ||
1668 !(desc->status & IRQ_MOVE_PENDING)) {
1669 desc->status &= ~IRQ_MOVE_PENDING;
1670 spin_unlock_irqrestore(&desc->lock, flags);
1671 continue;
1672 }
1673
1674 desc->chip->set_affinity(irq,
1675 irq_desc[irq].pending_mask);
1676 spin_unlock_irqrestore(&desc->lock, flags);
1677 }
1678 }
1679}
1680
1681/*
1682 * Migrates the IRQ destination in the process context.
1683 */
1684static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1685{
1686 if (irq_desc[irq].status & IRQ_LEVEL) {
1687 irq_desc[irq].status |= IRQ_MOVE_PENDING;
1688 irq_desc[irq].pending_mask = mask;
1689 migrate_irq_remapped_level(irq);
1690 return;
1691 }
1692
1693 migrate_ioapic_irq(irq, mask);
1694}
1695#endif
1696
1382asmlinkage void smp_irq_move_cleanup_interrupt(void) 1697asmlinkage void smp_irq_move_cleanup_interrupt(void)
1383{ 1698{
1384 unsigned vector, me; 1699 unsigned vector, me;
@@ -1435,6 +1750,17 @@ static void irq_complete_move(unsigned int irq)
1435#else 1750#else
1436static inline void irq_complete_move(unsigned int irq) {} 1751static inline void irq_complete_move(unsigned int irq) {}
1437#endif 1752#endif
1753#ifdef CONFIG_INTR_REMAP
1754static void ack_x2apic_level(unsigned int irq)
1755{
1756 ack_x2APIC_irq();
1757}
1758
1759static void ack_x2apic_edge(unsigned int irq)
1760{
1761 ack_x2APIC_irq();
1762}
1763#endif
1438 1764
1439static void ack_apic_edge(unsigned int irq) 1765static void ack_apic_edge(unsigned int irq)
1440{ 1766{
@@ -1509,6 +1835,21 @@ static struct irq_chip ioapic_chip __read_mostly = {
1509 .retrigger = ioapic_retrigger_irq, 1835 .retrigger = ioapic_retrigger_irq,
1510}; 1836};
1511 1837
1838#ifdef CONFIG_INTR_REMAP
1839static struct irq_chip ir_ioapic_chip __read_mostly = {
1840 .name = "IR-IO-APIC",
1841 .startup = startup_ioapic_irq,
1842 .mask = mask_IO_APIC_irq,
1843 .unmask = unmask_IO_APIC_irq,
1844 .ack = ack_x2apic_edge,
1845 .eoi = ack_x2apic_level,
1846#ifdef CONFIG_SMP
1847 .set_affinity = set_ir_ioapic_affinity_irq,
1848#endif
1849 .retrigger = ioapic_retrigger_irq,
1850};
1851#endif
1852
1512static inline void init_IO_APIC_traps(void) 1853static inline void init_IO_APIC_traps(void)
1513{ 1854{
1514 int irq; 1855 int irq;
@@ -1540,7 +1881,7 @@ static inline void init_IO_APIC_traps(void)
1540 } 1881 }
1541} 1882}
1542 1883
1543static void enable_lapic_irq (unsigned int irq) 1884static void unmask_lapic_irq(unsigned int irq)
1544{ 1885{
1545 unsigned long v; 1886 unsigned long v;
1546 1887
@@ -1548,7 +1889,7 @@ static void enable_lapic_irq (unsigned int irq)
1548 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 1889 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1549} 1890}
1550 1891
1551static void disable_lapic_irq (unsigned int irq) 1892static void mask_lapic_irq(unsigned int irq)
1552{ 1893{
1553 unsigned long v; 1894 unsigned long v;
1554 1895
@@ -1561,19 +1902,20 @@ static void ack_lapic_irq (unsigned int irq)
1561 ack_APIC_irq(); 1902 ack_APIC_irq();
1562} 1903}
1563 1904
1564static void end_lapic_irq (unsigned int i) { /* nothing */ } 1905static struct irq_chip lapic_chip __read_mostly = {
1565 1906 .name = "local-APIC",
1566static struct hw_interrupt_type lapic_irq_type __read_mostly = { 1907 .mask = mask_lapic_irq,
1567 .name = "local-APIC", 1908 .unmask = unmask_lapic_irq,
1568 .typename = "local-APIC-edge", 1909 .ack = ack_lapic_irq,
1569 .startup = NULL, /* startup_irq() not used for IRQ0 */
1570 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1571 .enable = enable_lapic_irq,
1572 .disable = disable_lapic_irq,
1573 .ack = ack_lapic_irq,
1574 .end = end_lapic_irq,
1575}; 1910};
1576 1911
1912static void lapic_register_intr(int irq)
1913{
1914 irq_desc[irq].status &= ~IRQ_LEVEL;
1915 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1916 "edge");
1917}
1918
1577static void __init setup_nmi(void) 1919static void __init setup_nmi(void)
1578{ 1920{
1579 /* 1921 /*
@@ -1659,6 +2001,7 @@ static inline void __init check_timer(void)
1659 struct irq_cfg *cfg = irq_cfg + 0; 2001 struct irq_cfg *cfg = irq_cfg + 0;
1660 int apic1, pin1, apic2, pin2; 2002 int apic1, pin1, apic2, pin2;
1661 unsigned long flags; 2003 unsigned long flags;
2004 int no_pin1 = 0;
1662 2005
1663 local_irq_save(flags); 2006 local_irq_save(flags);
1664 2007
@@ -1669,34 +2012,50 @@ static inline void __init check_timer(void)
1669 assign_irq_vector(0, TARGET_CPUS); 2012 assign_irq_vector(0, TARGET_CPUS);
1670 2013
1671 /* 2014 /*
1672 * Subtle, code in do_timer_interrupt() expects an AEOI 2015 * As IRQ0 is to be enabled in the 8259A, the virtual
1673 * mode for the 8259A whenever interrupts are routed 2016 * wire has to be disabled in the local APIC.
1674 * through I/O APICs. Also IRQ0 has to be enabled in
1675 * the 8259A which implies the virtual wire has to be
1676 * disabled in the local APIC.
1677 */ 2017 */
1678 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2018 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1679 init_8259A(1); 2019 init_8259A(1);
1680 if (timer_over_8254 > 0)
1681 enable_8259A_irq(0);
1682 2020
1683 pin1 = find_isa_irq_pin(0, mp_INT); 2021 pin1 = find_isa_irq_pin(0, mp_INT);
1684 apic1 = find_isa_irq_apic(0, mp_INT); 2022 apic1 = find_isa_irq_apic(0, mp_INT);
1685 pin2 = ioapic_i8259.pin; 2023 pin2 = ioapic_i8259.pin;
1686 apic2 = ioapic_i8259.apic; 2024 apic2 = ioapic_i8259.apic;
1687 2025
1688 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2026 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1689 cfg->vector, apic1, pin1, apic2, pin2); 2027 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2028 cfg->vector, apic1, pin1, apic2, pin2);
2029
2030 /*
2031 * Some BIOS writers are clueless and report the ExtINTA
2032 * I/O APIC input from the cascaded 8259A as the timer
2033 * interrupt input. So just in case, if only one pin
2034 * was found above, try it both directly and through the
2035 * 8259A.
2036 */
2037 if (pin1 == -1) {
2038 if (intr_remapping_enabled)
2039 panic("BIOS bug: timer not connected to IO-APIC");
2040 pin1 = pin2;
2041 apic1 = apic2;
2042 no_pin1 = 1;
2043 } else if (pin2 == -1) {
2044 pin2 = pin1;
2045 apic2 = apic1;
2046 }
1690 2047
1691 if (pin1 != -1) { 2048 if (pin1 != -1) {
1692 /* 2049 /*
1693 * Ok, does IRQ0 through the IOAPIC work? 2050 * Ok, does IRQ0 through the IOAPIC work?
1694 */ 2051 */
2052 if (no_pin1) {
2053 add_pin_to_irq(0, apic1, pin1);
2054 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2055 }
1695 unmask_IO_APIC_irq(0); 2056 unmask_IO_APIC_irq(0);
1696 if (!no_timer_check && timer_irq_works()) { 2057 if (!no_timer_check && timer_irq_works()) {
1697 nmi_watchdog_default();
1698 if (nmi_watchdog == NMI_IO_APIC) { 2058 if (nmi_watchdog == NMI_IO_APIC) {
1699 disable_8259A_irq(0);
1700 setup_nmi(); 2059 setup_nmi();
1701 enable_8259A_irq(0); 2060 enable_8259A_irq(0);
1702 } 2061 }
@@ -1704,55 +2063,65 @@ static inline void __init check_timer(void)
1704 clear_IO_APIC_pin(0, pin1); 2063 clear_IO_APIC_pin(0, pin1);
1705 goto out; 2064 goto out;
1706 } 2065 }
2066 if (intr_remapping_enabled)
2067 panic("timer doesn't work through Interrupt-remapped IO-APIC");
1707 clear_IO_APIC_pin(apic1, pin1); 2068 clear_IO_APIC_pin(apic1, pin1);
1708 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " 2069 if (!no_pin1)
1709 "connected to IO-APIC\n"); 2070 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1710 } 2071 "8254 timer not connected to IO-APIC\n");
1711 2072
1712 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " 2073 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1713 "through the 8259A ... "); 2074 "(IRQ0) through the 8259A ...\n");
1714 if (pin2 != -1) { 2075 apic_printk(APIC_QUIET, KERN_INFO
1715 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 2076 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1716 apic2, pin2);
1717 /* 2077 /*
1718 * legacy devices should be connected to IO APIC #0 2078 * legacy devices should be connected to IO APIC #0
1719 */ 2079 */
1720 setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); 2080 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2081 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2082 unmask_IO_APIC_irq(0);
2083 enable_8259A_irq(0);
1721 if (timer_irq_works()) { 2084 if (timer_irq_works()) {
1722 apic_printk(APIC_VERBOSE," works.\n"); 2085 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1723 nmi_watchdog_default(); 2086 timer_through_8259 = 1;
1724 if (nmi_watchdog == NMI_IO_APIC) { 2087 if (nmi_watchdog == NMI_IO_APIC) {
2088 disable_8259A_irq(0);
1725 setup_nmi(); 2089 setup_nmi();
2090 enable_8259A_irq(0);
1726 } 2091 }
1727 goto out; 2092 goto out;
1728 } 2093 }
1729 /* 2094 /*
1730 * Cleanup, just in case ... 2095 * Cleanup, just in case ...
1731 */ 2096 */
2097 disable_8259A_irq(0);
1732 clear_IO_APIC_pin(apic2, pin2); 2098 clear_IO_APIC_pin(apic2, pin2);
2099 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1733 } 2100 }
1734 apic_printk(APIC_VERBOSE," failed.\n");
1735 2101
1736 if (nmi_watchdog == NMI_IO_APIC) { 2102 if (nmi_watchdog == NMI_IO_APIC) {
1737 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2103 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1738 nmi_watchdog = 0; 2104 "through the IO-APIC - disabling NMI Watchdog!\n");
2105 nmi_watchdog = NMI_NONE;
1739 } 2106 }
1740 2107
1741 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2108 apic_printk(APIC_QUIET, KERN_INFO
2109 "...trying to set up timer as Virtual Wire IRQ...\n");
1742 2110
1743 disable_8259A_irq(0); 2111 lapic_register_intr(0);
1744 irq_desc[0].chip = &lapic_irq_type;
1745 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2112 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1746 enable_8259A_irq(0); 2113 enable_8259A_irq(0);
1747 2114
1748 if (timer_irq_works()) { 2115 if (timer_irq_works()) {
1749 apic_printk(APIC_VERBOSE," works.\n"); 2116 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1750 goto out; 2117 goto out;
1751 } 2118 }
2119 disable_8259A_irq(0);
1752 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2120 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1753 apic_printk(APIC_VERBOSE," failed.\n"); 2121 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1754 2122
1755 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2123 apic_printk(APIC_QUIET, KERN_INFO
2124 "...trying to set up timer as ExtINT IRQ...\n");
1756 2125
1757 init_8259A(0); 2126 init_8259A(0);
1758 make_8259A_irq(0); 2127 make_8259A_irq(0);
@@ -1761,11 +2130,12 @@ static inline void __init check_timer(void)
1761 unlock_ExtINT_logic(); 2130 unlock_ExtINT_logic();
1762 2131
1763 if (timer_irq_works()) { 2132 if (timer_irq_works()) {
1764 apic_printk(APIC_VERBOSE," works.\n"); 2133 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1765 goto out; 2134 goto out;
1766 } 2135 }
1767 apic_printk(APIC_VERBOSE," failed :(.\n"); 2136 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1768 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 2137 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2138 "report. Then try booting with the 'noapic' option.\n");
1769out: 2139out:
1770 local_irq_restore(flags); 2140 local_irq_restore(flags);
1771} 2141}
@@ -1778,11 +2148,21 @@ static int __init notimercheck(char *s)
1778__setup("no_timer_check", notimercheck); 2148__setup("no_timer_check", notimercheck);
1779 2149
1780/* 2150/*
1781 * 2151 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1782 * IRQs that are handled by the PIC in the MPS IOAPIC case. 2152 * to devices. However there may be an I/O APIC pin available for
1783 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 2153 * this interrupt regardless. The pin may be left unconnected, but
1784 * Linux doesn't really care, as it's not actually used 2154 * typically it will be reused as an ExtINT cascade interrupt for
1785 * for any interrupt handling anyway. 2155 * the master 8259A. In the MPS case such a pin will normally be
2156 * reported as an ExtINT interrupt in the MP table. With ACPI
2157 * there is no provision for ExtINT interrupts, and in the absence
2158 * of an override it would be treated as an ordinary ISA I/O APIC
2159 * interrupt, that is edge-triggered and unmasked by default. We
2160 * used to do this, but it caused problems on some systems because
2161 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2162 * the same ExtINT cascade interrupt to drive the local APIC of the
2163 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2164 * the I/O APIC in all cases now. No actual device should request
2165 * it anyway. --macro
1786 */ 2166 */
1787#define PIC_IRQS (1<<2) 2167#define PIC_IRQS (1<<2)
1788 2168
@@ -1793,10 +2173,7 @@ void __init setup_IO_APIC(void)
1793 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 2173 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1794 */ 2174 */
1795 2175
1796 if (acpi_ioapic) 2176 io_apic_irqs = ~PIC_IRQS;
1797 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1798 else
1799 io_apic_irqs = ~PIC_IRQS;
1800 2177
1801 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 2178 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1802 2179
@@ -1804,8 +2181,6 @@ void __init setup_IO_APIC(void)
1804 setup_IO_APIC_irqs(); 2181 setup_IO_APIC_irqs();
1805 init_IO_APIC_traps(); 2182 init_IO_APIC_traps();
1806 check_timer(); 2183 check_timer();
1807 if (!acpi_ioapic)
1808 print_IO_APIC();
1809} 2184}
1810 2185
1811struct sysfs_ioapic_data { 2186struct sysfs_ioapic_data {
@@ -1841,8 +2216,8 @@ static int ioapic_resume(struct sys_device *dev)
1841 2216
1842 spin_lock_irqsave(&ioapic_lock, flags); 2217 spin_lock_irqsave(&ioapic_lock, flags);
1843 reg_00.raw = io_apic_read(dev->id, 0); 2218 reg_00.raw = io_apic_read(dev->id, 0);
1844 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 2219 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
1845 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 2220 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
1846 io_apic_write(dev->id, 0, reg_00.raw); 2221 io_apic_write(dev->id, 0, reg_00.raw);
1847 } 2222 }
1848 spin_unlock_irqrestore(&ioapic_lock, flags); 2223 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1927,6 +2302,9 @@ void destroy_irq(unsigned int irq)
1927 2302
1928 dynamic_irq_cleanup(irq); 2303 dynamic_irq_cleanup(irq);
1929 2304
2305#ifdef CONFIG_INTR_REMAP
2306 free_irte(irq);
2307#endif
1930 spin_lock_irqsave(&vector_lock, flags); 2308 spin_lock_irqsave(&vector_lock, flags);
1931 __clear_irq_vector(irq); 2309 __clear_irq_vector(irq);
1932 spin_unlock_irqrestore(&vector_lock, flags); 2310 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1945,10 +2323,41 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
1945 2323
1946 tmp = TARGET_CPUS; 2324 tmp = TARGET_CPUS;
1947 err = assign_irq_vector(irq, tmp); 2325 err = assign_irq_vector(irq, tmp);
1948 if (!err) { 2326 if (err)
1949 cpus_and(tmp, cfg->domain, tmp); 2327 return err;
1950 dest = cpu_mask_to_apicid(tmp); 2328
2329 cpus_and(tmp, cfg->domain, tmp);
2330 dest = cpu_mask_to_apicid(tmp);
2331
2332#ifdef CONFIG_INTR_REMAP
2333 if (irq_remapped(irq)) {
2334 struct irte irte;
2335 int ir_index;
2336 u16 sub_handle;
2337
2338 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2339 BUG_ON(ir_index == -1);
2340
2341 memset(&irte, 0, sizeof(irte));
1951 2342
2343 irte.present = 1;
2344 irte.dst_mode = INT_DEST_MODE;
2345 irte.trigger_mode = 0; /* edge */
2346 irte.dlvry_mode = INT_DELIVERY_MODE;
2347 irte.vector = cfg->vector;
2348 irte.dest_id = IRTE_DEST(dest);
2349
2350 modify_irte(irq, &irte);
2351
2352 msg->address_hi = MSI_ADDR_BASE_HI;
2353 msg->data = sub_handle;
2354 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2355 MSI_ADDR_IR_SHV |
2356 MSI_ADDR_IR_INDEX1(ir_index) |
2357 MSI_ADDR_IR_INDEX2(ir_index);
2358 } else
2359#endif
2360 {
1952 msg->address_hi = MSI_ADDR_BASE_HI; 2361 msg->address_hi = MSI_ADDR_BASE_HI;
1953 msg->address_lo = 2362 msg->address_lo =
1954 MSI_ADDR_BASE_LO | 2363 MSI_ADDR_BASE_LO |
@@ -1999,6 +2408,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
1999 write_msi_msg(irq, &msg); 2408 write_msi_msg(irq, &msg);
2000 irq_desc[irq].affinity = mask; 2409 irq_desc[irq].affinity = mask;
2001} 2410}
2411
2412#ifdef CONFIG_INTR_REMAP
2413/*
2414 * Migrate the MSI irq to another cpumask. This migration is
2415 * done in the process context using interrupt-remapping hardware.
2416 */
2417static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2418{
2419 struct irq_cfg *cfg = irq_cfg + irq;
2420 unsigned int dest;
2421 cpumask_t tmp, cleanup_mask;
2422 struct irte irte;
2423
2424 cpus_and(tmp, mask, cpu_online_map);
2425 if (cpus_empty(tmp))
2426 return;
2427
2428 if (get_irte(irq, &irte))
2429 return;
2430
2431 if (assign_irq_vector(irq, mask))
2432 return;
2433
2434 cpus_and(tmp, cfg->domain, mask);
2435 dest = cpu_mask_to_apicid(tmp);
2436
2437 irte.vector = cfg->vector;
2438 irte.dest_id = IRTE_DEST(dest);
2439
2440 /*
2441 * atomically update the IRTE with the new destination and vector.
2442 */
2443 modify_irte(irq, &irte);
2444
2445 /*
2446 * After this point, all the interrupts will start arriving
2447 * at the new destination. So, time to cleanup the previous
2448 * vector allocation.
2449 */
2450 if (cfg->move_in_progress) {
2451 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2452 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2453 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2454 cfg->move_in_progress = 0;
2455 }
2456
2457 irq_desc[irq].affinity = mask;
2458}
2459#endif
2002#endif /* CONFIG_SMP */ 2460#endif /* CONFIG_SMP */
2003 2461
2004/* 2462/*
@@ -2016,26 +2474,157 @@ static struct irq_chip msi_chip = {
2016 .retrigger = ioapic_retrigger_irq, 2474 .retrigger = ioapic_retrigger_irq,
2017}; 2475};
2018 2476
2019int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 2477#ifdef CONFIG_INTR_REMAP
2478static struct irq_chip msi_ir_chip = {
2479 .name = "IR-PCI-MSI",
2480 .unmask = unmask_msi_irq,
2481 .mask = mask_msi_irq,
2482 .ack = ack_x2apic_edge,
2483#ifdef CONFIG_SMP
2484 .set_affinity = ir_set_msi_irq_affinity,
2485#endif
2486 .retrigger = ioapic_retrigger_irq,
2487};
2488
2489/*
2490 * Map the PCI dev to the corresponding remapping hardware unit
2491 * and allocate 'nvec' consecutive interrupt-remapping table entries
2492 * in it.
2493 */
2494static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2495{
2496 struct intel_iommu *iommu;
2497 int index;
2498
2499 iommu = map_dev_to_ir(dev);
2500 if (!iommu) {
2501 printk(KERN_ERR
2502 "Unable to map PCI %s to iommu\n", pci_name(dev));
2503 return -ENOENT;
2504 }
2505
2506 index = alloc_irte(iommu, irq, nvec);
2507 if (index < 0) {
2508 printk(KERN_ERR
2509 "Unable to allocate %d IRTE for PCI %s\n", nvec,
2510 pci_name(dev));
2511 return -ENOSPC;
2512 }
2513 return index;
2514}
2515#endif
2516
2517static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2020{ 2518{
2519 int ret;
2021 struct msi_msg msg; 2520 struct msi_msg msg;
2521
2522 ret = msi_compose_msg(dev, irq, &msg);
2523 if (ret < 0)
2524 return ret;
2525
2526 set_irq_msi(irq, desc);
2527 write_msi_msg(irq, &msg);
2528
2529#ifdef CONFIG_INTR_REMAP
2530 if (irq_remapped(irq)) {
2531 struct irq_desc *desc = irq_desc + irq;
2532 /*
2533 * irq migration in process context
2534 */
2535 desc->status |= IRQ_MOVE_PCNTXT;
2536 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
2537 } else
2538#endif
2539 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2540
2541 return 0;
2542}
2543
2544int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2545{
2022 int irq, ret; 2546 int irq, ret;
2547
2023 irq = create_irq(); 2548 irq = create_irq();
2024 if (irq < 0) 2549 if (irq < 0)
2025 return irq; 2550 return irq;
2026 2551
2027 ret = msi_compose_msg(dev, irq, &msg); 2552#ifdef CONFIG_INTR_REMAP
2553 if (!intr_remapping_enabled)
2554 goto no_ir;
2555
2556 ret = msi_alloc_irte(dev, irq, 1);
2557 if (ret < 0)
2558 goto error;
2559no_ir:
2560#endif
2561 ret = setup_msi_irq(dev, desc, irq);
2028 if (ret < 0) { 2562 if (ret < 0) {
2029 destroy_irq(irq); 2563 destroy_irq(irq);
2030 return ret; 2564 return ret;
2031 } 2565 }
2566 return 0;
2032 2567
2033 set_irq_msi(irq, desc); 2568#ifdef CONFIG_INTR_REMAP
2034 write_msi_msg(irq, &msg); 2569error:
2570 destroy_irq(irq);
2571 return ret;
2572#endif
2573}
2035 2574
2036 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 2575int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
2576{
2577 int irq, ret, sub_handle;
2578 struct msi_desc *desc;
2579#ifdef CONFIG_INTR_REMAP
2580 struct intel_iommu *iommu = NULL;
2581 int index = 0;
2582#endif
2583
2584 sub_handle = 0;
2585 list_for_each_entry(desc, &dev->msi_list, list) {
2586 irq = create_irq();
2587 if (irq < 0)
2588 return irq;
2589#ifdef CONFIG_INTR_REMAP
2590 if (!intr_remapping_enabled)
2591 goto no_ir;
2037 2592
2593 if (!sub_handle) {
2594 /*
2595 * allocate the consecutive block of IRTE's
2596 * for 'nvec'
2597 */
2598 index = msi_alloc_irte(dev, irq, nvec);
2599 if (index < 0) {
2600 ret = index;
2601 goto error;
2602 }
2603 } else {
2604 iommu = map_dev_to_ir(dev);
2605 if (!iommu) {
2606 ret = -ENOENT;
2607 goto error;
2608 }
2609 /*
2610 * setup the mapping between the irq and the IRTE
2611 * base index, the sub_handle pointing to the
2612 * appropriate interrupt remap table entry.
2613 */
2614 set_irte_irq(irq, iommu, index, sub_handle);
2615 }
2616no_ir:
2617#endif
2618 ret = setup_msi_irq(dev, desc, irq);
2619 if (ret < 0)
2620 goto error;
2621 sub_handle++;
2622 }
2038 return 0; 2623 return 0;
2624
2625error:
2626 destroy_irq(irq);
2627 return ret;
2039} 2628}
2040 2629
2041void arch_teardown_msi_irq(unsigned int irq) 2630void arch_teardown_msi_irq(unsigned int irq)
@@ -2242,8 +2831,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2242 return -1; 2831 return -1;
2243 2832
2244 for (i = 0; i < mp_irq_entries; i++) 2833 for (i = 0; i < mp_irq_entries; i++)
2245 if (mp_irqs[i].mpc_irqtype == mp_INT && 2834 if (mp_irqs[i].mp_irqtype == mp_INT &&
2246 mp_irqs[i].mpc_srcbusirq == bus_irq) 2835 mp_irqs[i].mp_srcbusirq == bus_irq)
2247 break; 2836 break;
2248 if (i >= mp_irq_entries) 2837 if (i >= mp_irq_entries)
2249 return -1; 2838 return -1;
@@ -2283,6 +2872,10 @@ void __init setup_ioapic_dest(void)
2283 setup_IO_APIC_irq(ioapic, pin, irq, 2872 setup_IO_APIC_irq(ioapic, pin, irq,
2284 irq_trigger(irq_entry), 2873 irq_trigger(irq_entry),
2285 irq_polarity(irq_entry)); 2874 irq_polarity(irq_entry));
2875#ifdef CONFIG_INTR_REMAP
2876 else if (intr_remapping_enabled)
2877 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
2878#endif
2286 else 2879 else
2287 set_ioapic_affinity_irq(irq, TARGET_CPUS); 2880 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2288 } 2881 }
@@ -2336,7 +2929,7 @@ void __init ioapic_init_mappings(void)
2336 ioapic_res = ioapic_setup_resources(); 2929 ioapic_res = ioapic_setup_resources();
2337 for (i = 0; i < nr_ioapics; i++) { 2930 for (i = 0; i < nr_ioapics; i++) {
2338 if (smp_found_config) { 2931 if (smp_found_config) {
2339 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 2932 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2340 } else { 2933 } else {
2341 ioapic_phys = (unsigned long) 2934 ioapic_phys = (unsigned long)
2342 alloc_bootmem_pages(PAGE_SIZE); 2935 alloc_bootmem_pages(PAGE_SIZE);
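
With ACPI and MPS now treated alike, setup_IO_APIC() above always programs io_apic_irqs = ~PIC_IRQS, i.e. every ISA IRQ except IRQ2 (the 8259A cascade) is expected to be delivered through the IO-APIC. A small userspace sketch of that bitmask, assuming the usual "bit n set means ISA IRQ n is IO-APIC driven" interpretation; the routing table it prints is only illustrative:

#include <stdio.h>

/* Mirrors the PIC_IRQS / io_apic_irqs bitmask from the hunk above. */
#define PIC_IRQS (1u << 2)

int main(void)
{
    unsigned int io_apic_irqs = ~PIC_IRQS;
    int irq;

    for (irq = 0; irq < 16; irq++)
        printf("IRQ%-2d -> %s\n", irq,
               (io_apic_irqs & (1u << irq)) ? "IO-APIC" : "8259A (PIC)");
    return 0;
}
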
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
@@ -103,6 +111,9 @@ void __init io_delay_init(void)
103 111
104static int __init io_delay_param(char *s) 112static int __init io_delay_param(char *s)
105{ 113{
114 if (!s)
115 return -EINVAL;
116
106 if (!strcmp(s, "0x80")) 117 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 118 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 119 else if (!strcmp(s, "0xed"))
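
The added NULL check in io_delay_param() guards against the boot parameter being given without a value, in which case the parsing callback can be handed a NULL string (an assumption about the parameter machinery, not something shown in this hunk). A minimal userspace sketch of the same parser shape, with stand-in constants for the real io_delay_type values:

#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Illustrative stand-ins for the kernel's CONFIG_IO_DELAY_TYPE_* values. */
enum { DELAY_0X80, DELAY_0XED };
static int io_delay_type = DELAY_0X80;

/* Same shape as the parser above: a NULL argument must be rejected,
 * never dereferenced. */
static int io_delay_param(char *s)
{
    if (!s)
        return -EINVAL;
    if (!strcmp(s, "0x80"))
        io_delay_type = DELAY_0X80;
    else if (!strcmp(s, "0xed"))
        io_delay_type = DELAY_0XED;
    else
        return -EINVAL;
    return 0;
}

int main(void)
{
    printf("NULL -> %d\n", io_delay_param(NULL));  /* -EINVAL, no crash */
    printf("0xed -> %d (type=%d)\n", io_delay_param("0xed"), io_delay_type);
    return 0;
}
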
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <asm/syscalls.h>
17 18
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, 20static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca23..f1c688e46f35 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
8#include <linux/kernel_stat.h> 8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h> 9#include <linux/mc146818rtc.h>
10#include <linux/cache.h> 10#include <linux/cache.h>
11#include <linux/interrupt.h>
12#include <linux/cpu.h> 11#include <linux/cpu.h>
13#include <linux/module.h> 12#include <linux/module.h>
14 13
@@ -21,6 +20,8 @@
21 20
22#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
23#include <mach_apic.h> 22#include <mach_apic.h>
23#include <mach_ipi.h>
24
24/* 25/*
25 * the following functions deal with sending IPIs between CPUs. 26 * the following functions deal with sending IPIs between CPUs.
26 * 27 *
@@ -71,7 +72,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
71 /* 72 /*
72 * Send the IPI. The write to APIC_ICR fires this off. 73 * Send the IPI. The write to APIC_ICR fires this off.
73 */ 74 */
74 apic_write_around(APIC_ICR, cfg); 75 apic_write(APIC_ICR, cfg);
75} 76}
76 77
77void send_IPI_self(int vector) 78void send_IPI_self(int vector)
@@ -99,7 +100,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
99 * prepare target chip field 100 * prepare target chip field
100 */ 101 */
101 cfg = __prepare_ICR2(mask); 102 cfg = __prepare_ICR2(mask);
102 apic_write_around(APIC_ICR2, cfg); 103 apic_write(APIC_ICR2, cfg);
103 104
104 /* 105 /*
105 * program the ICR 106 * program the ICR
@@ -109,7 +110,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
109 /* 110 /*
110 * Send the IPI. The write to APIC_ICR fires this off. 111 * Send the IPI. The write to APIC_ICR fires this off.
111 */ 112 */
112 apic_write_around(APIC_ICR, cfg); 113 apic_write(APIC_ICR, cfg);
113} 114}
114 115
115/* 116/*
@@ -148,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
148} 149}
149 150
150/* must come after the send_IPI functions above for inlining */ 151/* must come after the send_IPI functions above for inlining */
151#include <mach_ipi.h>
152static int convert_apicid_to_cpu(int apic_id) 152static int convert_apicid_to_cpu(int apic_id)
153{ 153{
154 int i; 154 int i;
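
The ipi.c changes replace apic_write_around() with plain apic_write(), but the send sequence itself is unchanged: the destination is programmed into ICR2 first, and the write of the low ICR word is what actually launches the IPI. A sketch of the two 32-bit words for a fixed-delivery IPI, using an assumed xAPIC field layout (destination ID in bits 24-31 of ICR2, vector in bits 0-7 of ICR) rather than anything taken from this patch:

#include <stdio.h>
#include <stdint.h>

#define APIC_DM_FIXED       (0u << 8)    /* fixed delivery mode (assumed encoding) */
#define APIC_DEST_PHYSICAL  (0u << 11)   /* physical destination mode */
#define SET_APIC_DEST_FIELD(x) ((uint32_t)(x) << 24)

int main(void)
{
    unsigned int dest_apicid = 3, vector = 0xfd;   /* made-up target and vector */
    uint32_t icr2 = SET_APIC_DEST_FIELD(dest_apicid);
    uint32_t icr  = APIC_DM_FIXED | APIC_DEST_PHYSICAL | vector;

    /* In the kernel this would be two MMIO writes, ICR2 first:
     *   apic_write(APIC_ICR2, icr2);
     *   apic_write(APIC_ICR, icr);    <- this write fires the IPI
     */
    printf("ICR2 = 0x%08x  ICR = 0x%08x\n", icr2, icr);
    return 0;
}
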
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..b71e02d42f4f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
48#endif 48#endif
49} 49}
50 50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void)
54{
55 long sp;
56
57 __asm__ __volatile__("andl %%esp,%0" :
58 "=r" (sp) : "0" (THREAD_SIZE - 1));
59
60 return sp < (sizeof(struct thread_info) + STACK_WARN);
61}
62
63static void print_stack_overflow(void)
64{
65 printk(KERN_WARNING "low stack detected by irq handler\n");
66 dump_stack();
67}
68
69#else
70static inline int check_stack_overflow(void) { return 0; }
71static inline void print_stack_overflow(void) { }
72#endif
73
51#ifdef CONFIG_4KSTACKS 74#ifdef CONFIG_4KSTACKS
52/* 75/*
53 * per-CPU IRQ handling contexts (thread information and stack) 76 * per-CPU IRQ handling contexts (thread information and stack)
@@ -59,48 +82,26 @@ union irq_ctx {
59 82
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
62#endif
63
64/*
65 * do_IRQ handles all normal device IRQ's (the special
66 * SMP cross-CPU interrupts have their own specific
67 * handlers).
68 */
69unsigned int do_IRQ(struct pt_regs *regs)
70{
71 struct pt_regs *old_regs;
72 /* high bit used in ret_from_ code */
73 int irq = ~regs->orig_ax;
74 struct irq_desc *desc = irq_desc + irq;
75#ifdef CONFIG_4KSTACKS
76 union irq_ctx *curctx, *irqctx;
77 u32 *isp;
78#endif
79 85
80 if (unlikely((unsigned)irq >= NR_IRQS)) { 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
81 printk(KERN_EMERG "%s: cannot handle IRQ %d\n", 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
82 __func__, irq);
83 BUG();
84 }
85 88
86 old_regs = set_irq_regs(regs); 89static void call_on_stack(void *func, void *stack)
87 irq_enter(); 90{
88#ifdef CONFIG_DEBUG_STACKOVERFLOW 91 asm volatile("xchgl %%ebx,%%esp \n"
89 /* Debugging check for stack overflow: is there less than 1KB free? */ 92 "call *%%edi \n"
90 { 93 "movl %%ebx,%%esp \n"
91 long sp; 94 : "=b" (stack)
92 95 : "0" (stack),
93 __asm__ __volatile__("andl %%esp,%0" : 96 "D"(func)
94 "=r" (sp) : "0" (THREAD_SIZE - 1)); 97 : "memory", "cc", "edx", "ecx", "eax");
95 if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { 98}
96 printk("do_IRQ: stack overflow: %ld\n",
97 sp - sizeof(struct thread_info));
98 dump_stack();
99 }
100 }
101#endif
102 99
103#ifdef CONFIG_4KSTACKS 100static inline int
101execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
102{
103 union irq_ctx *curctx, *irqctx;
104 u32 *isp, arg1, arg2;
104 105
105 curctx = (union irq_ctx *) current_thread_info(); 106 curctx = (union irq_ctx *) current_thread_info();
106 irqctx = hardirq_ctx[smp_processor_id()]; 107 irqctx = hardirq_ctx[smp_processor_id()];
@@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs)
111 * handler) we can't do that and just have to keep using the 112 * handler) we can't do that and just have to keep using the
112 * current stack (which is the irq stack already after all) 113 * current stack (which is the irq stack already after all)
113 */ 114 */
114 if (curctx != irqctx) { 115 if (unlikely(curctx == irqctx))
115 int arg1, arg2, bx; 116 return 0;
116
117 /* build the stack frame on the IRQ stack */
118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
119 irqctx->tinfo.task = curctx->tinfo.task;
120 irqctx->tinfo.previous_esp = current_stack_pointer;
121 117
122 /* 118 /* build the stack frame on the IRQ stack */
123 * Copy the softirq bits in preempt_count so that the 119 isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
124 * softirq checks work in the hardirq context. 120 irqctx->tinfo.task = curctx->tinfo.task;
125 */ 121 irqctx->tinfo.previous_esp = current_stack_pointer;
126 irqctx->tinfo.preempt_count =
127 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
129
130 asm volatile(
131 " xchgl %%ebx,%%esp \n"
132 " call *%%edi \n"
133 " movl %%ebx,%%esp \n"
134 : "=a" (arg1), "=d" (arg2), "=b" (bx)
135 : "0" (irq), "1" (desc), "2" (isp),
136 "D" (desc->handle_irq)
137 : "memory", "cc", "ecx"
138 );
139 } else
140#endif
141 desc->handle_irq(irq, desc);
142 122
143 irq_exit(); 123 /*
144 set_irq_regs(old_regs); 124 * Copy the softirq bits in preempt_count so that the
125 * softirq checks work in the hardirq context.
126 */
127 irqctx->tinfo.preempt_count =
128 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
129 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
130
131 if (unlikely(overflow))
132 call_on_stack(print_stack_overflow, isp);
133
134 asm volatile("xchgl %%ebx,%%esp \n"
135 "call *%%edi \n"
136 "movl %%ebx,%%esp \n"
137 : "=a" (arg1), "=d" (arg2), "=b" (isp)
138 : "0" (irq), "1" (desc), "2" (isp),
139 "D" (desc->handle_irq)
140 : "memory", "cc", "ecx");
145 return 1; 141 return 1;
146} 142}
147 143
148#ifdef CONFIG_4KSTACKS
149
150static char softirq_stack[NR_CPUS * THREAD_SIZE]
151 __attribute__((__section__(".bss.page_aligned")));
152
153static char hardirq_stack[NR_CPUS * THREAD_SIZE]
154 __attribute__((__section__(".bss.page_aligned")));
155
156/* 144/*
157 * allocate per-cpu stacks for hardirq and for softirq processing 145 * allocate per-cpu stacks for hardirq and for softirq processing
158 */ 146 */
159void irq_ctx_init(int cpu) 147void __cpuinit irq_ctx_init(int cpu)
160{ 148{
161 union irq_ctx *irqctx; 149 union irq_ctx *irqctx;
162 150
@@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
164 return; 152 return;
165 153
166 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 154 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
167 irqctx->tinfo.task = NULL; 155 irqctx->tinfo.task = NULL;
168 irqctx->tinfo.exec_domain = NULL; 156 irqctx->tinfo.exec_domain = NULL;
169 irqctx->tinfo.cpu = cpu; 157 irqctx->tinfo.cpu = cpu;
170 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 158 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
171 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 159 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
172 160
173 hardirq_ctx[cpu] = irqctx; 161 hardirq_ctx[cpu] = irqctx;
174 162
175 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; 163 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
176 irqctx->tinfo.task = NULL; 164 irqctx->tinfo.task = NULL;
177 irqctx->tinfo.exec_domain = NULL; 165 irqctx->tinfo.exec_domain = NULL;
178 irqctx->tinfo.cpu = cpu; 166 irqctx->tinfo.cpu = cpu;
179 irqctx->tinfo.preempt_count = 0; 167 irqctx->tinfo.preempt_count = 0;
180 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 168 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
181 169
182 softirq_ctx[cpu] = irqctx; 170 softirq_ctx[cpu] = irqctx;
183 171
184 printk("CPU %u irqstacks, hard=%p soft=%p\n", 172 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
185 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); 173 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
186} 174}
187 175
188void irq_ctx_exit(int cpu) 176void irq_ctx_exit(int cpu)
@@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
211 /* build the stack frame on the softirq stack */ 199 /* build the stack frame on the softirq stack */
212 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 200 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
213 201
214 asm volatile( 202 call_on_stack(__do_softirq, isp);
215 " xchgl %%ebx,%%esp \n"
216 " call __do_softirq \n"
217 " movl %%ebx,%%esp \n"
218 : "=b"(isp)
219 : "0"(isp)
220 : "memory", "cc", "edx", "ecx", "eax"
221 );
222 /* 203 /*
223 * Shouldn't happen, we returned above if in_interrupt(): 204
224 */ 205 */
225 WARN_ON_ONCE(softirq_count()); 206 WARN_ON_ONCE(softirq_count());
226 } 207 }
227 208
228 local_irq_restore(flags); 209 local_irq_restore(flags);
229} 210}
211
212#else
213static inline int
214execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
230#endif 215#endif
231 216
232/* 217/*
218 * do_IRQ handles all normal device IRQ's (the special
219 * SMP cross-CPU interrupts have their own specific
220 * handlers).
221 */
222unsigned int do_IRQ(struct pt_regs *regs)
223{
224 struct pt_regs *old_regs;
225 /* high bit used in ret_from_ code */
226 int overflow, irq = ~regs->orig_ax;
227 struct irq_desc *desc = irq_desc + irq;
228
229 if (unlikely((unsigned)irq >= NR_IRQS)) {
230 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
231 __func__, irq);
232 BUG();
233 }
234
235 old_regs = set_irq_regs(regs);
236 irq_enter();
237
238 overflow = check_stack_overflow();
239
240 if (!execute_on_irq_stack(overflow, desc, irq)) {
241 if (unlikely(overflow))
242 print_stack_overflow();
243 desc->handle_irq(irq, desc);
244 }
245
246 irq_exit();
247 set_irq_regs(old_regs);
248 return 1;
249}
250
251/*
233 * Interrupt statistics: 252 * Interrupt statistics:
234 */ 253 */
235 254
@@ -306,23 +325,27 @@ skip:
306 for_each_online_cpu(j) 325 for_each_online_cpu(j)
307 seq_printf(p, "%10u ", 326 seq_printf(p, "%10u ",
308 per_cpu(irq_stat,j).irq_call_count); 327 per_cpu(irq_stat,j).irq_call_count);
309 seq_printf(p, " function call interrupts\n"); 328 seq_printf(p, " Function call interrupts\n");
310 seq_printf(p, "TLB: "); 329 seq_printf(p, "TLB: ");
311 for_each_online_cpu(j) 330 for_each_online_cpu(j)
312 seq_printf(p, "%10u ", 331 seq_printf(p, "%10u ",
313 per_cpu(irq_stat,j).irq_tlb_count); 332 per_cpu(irq_stat,j).irq_tlb_count);
314 seq_printf(p, " TLB shootdowns\n"); 333 seq_printf(p, " TLB shootdowns\n");
315#endif 334#endif
335#ifdef CONFIG_X86_MCE
316 seq_printf(p, "TRM: "); 336 seq_printf(p, "TRM: ");
317 for_each_online_cpu(j) 337 for_each_online_cpu(j)
318 seq_printf(p, "%10u ", 338 seq_printf(p, "%10u ",
319 per_cpu(irq_stat,j).irq_thermal_count); 339 per_cpu(irq_stat,j).irq_thermal_count);
320 seq_printf(p, " Thermal event interrupts\n"); 340 seq_printf(p, " Thermal event interrupts\n");
341#endif
342#ifdef CONFIG_X86_LOCAL_APIC
321 seq_printf(p, "SPU: "); 343 seq_printf(p, "SPU: ");
322 for_each_online_cpu(j) 344 for_each_online_cpu(j)
323 seq_printf(p, "%10u ", 345 seq_printf(p, "%10u ",
324 per_cpu(irq_stat,j).irq_spurious_count); 346 per_cpu(irq_stat,j).irq_spurious_count);
325 seq_printf(p, " Spurious interrupts\n"); 347 seq_printf(p, " Spurious interrupts\n");
348#endif
326 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 349 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
327#if defined(CONFIG_X86_IO_APIC) 350#if defined(CONFIG_X86_IO_APIC)
328 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 351 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +354,40 @@ skip:
331 return 0; 354 return 0;
332} 355}
333 356
357/*
358 * /proc/stat helpers
359 */
360u64 arch_irq_stat_cpu(unsigned int cpu)
361{
362 u64 sum = nmi_count(cpu);
363
364#ifdef CONFIG_X86_LOCAL_APIC
365 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
366#endif
367#ifdef CONFIG_SMP
368 sum += per_cpu(irq_stat, cpu).irq_resched_count;
369 sum += per_cpu(irq_stat, cpu).irq_call_count;
370 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
371#endif
372#ifdef CONFIG_X86_MCE
373 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
374#endif
375#ifdef CONFIG_X86_LOCAL_APIC
376 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
377#endif
378 return sum;
379}
380
381u64 arch_irq_stat(void)
382{
383 u64 sum = atomic_read(&irq_err_count);
384
385#ifdef CONFIG_X86_IO_APIC
386 sum += atomic_read(&irq_mis_count);
387#endif
388 return sum;
389}
390
334#ifdef CONFIG_HOTPLUG_CPU 391#ifdef CONFIG_HOTPLUG_CPU
335#include <mach_apic.h> 392#include <mach_apic.h>
336 393
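
The irq_32.c rework factors the "less than 1KB free?" test into check_stack_overflow(). Because the per-task kernel stack is THREAD_SIZE bytes and THREAD_SIZE aligned, masking %esp with THREAD_SIZE-1 yields the stack pointer's offset inside that area; the stack grows down toward the struct thread_info at the bottom, so a small offset means little room is left. A standalone sketch of the same arithmetic (all constants below are illustrative assumptions, not the kernel's values):

#include <stdio.h>

#define THREAD_SIZE       8192UL   /* assumed stack size */
#define THREAD_INFO_SIZE    64UL   /* assumed sizeof(struct thread_info) */
#define STACK_WARN        1024UL   /* "less than 1KB free?" */

/* Same arithmetic as check_stack_overflow(): the stack is THREAD_SIZE
 * aligned, so sp & (THREAD_SIZE - 1) is the offset within the stack area. */
static int check_stack_overflow(unsigned long sp)
{
    unsigned long offset = sp & (THREAD_SIZE - 1);
    return offset < (THREAD_INFO_SIZE + STACK_WARN);
}

int main(void)
{
    unsigned long base = 0xc1200000UL;                  /* assumed, 8KB aligned */
    printf("%d\n", check_stack_overflow(base + 4096));  /* plenty left: 0 */
    printf("%d\n", check_stack_overflow(base + 512));   /* nearly full: 1 */
    return 0;
}
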
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a91..f065fe9071b9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,12 +129,13 @@ skip:
129 seq_printf(p, "CAL: "); 129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j) 130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); 131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n"); 132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: "); 133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j) 134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n"); 136 seq_printf(p, " TLB shootdowns\n");
137#endif 137#endif
138#ifdef CONFIG_X86_MCE
138 seq_printf(p, "TRM: "); 139 seq_printf(p, "TRM: ");
139 for_each_online_cpu(j) 140 for_each_online_cpu(j)
140 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); 141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
143 for_each_online_cpu(j) 144 for_each_online_cpu(j)
144 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); 145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
145 seq_printf(p, " Threshold APIC interrupts\n"); 146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
146 seq_printf(p, "SPU: "); 148 seq_printf(p, "SPU: ");
147 for_each_online_cpu(j) 149 for_each_online_cpu(j)
148 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); 150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -153,6 +155,32 @@ skip:
153} 155}
154 156
155/* 157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
156 * do_IRQ handles all normal device IRQ's (the special 184 * do_IRQ handles all normal device IRQ's (the special
157 * SMP cross-CPU interrupts have their own specific 185 * SMP cross-CPU interrupts have their own specific
158 * handlers). 186 * handlers).
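
The new arch_irq_stat_cpu()/arch_irq_stat() helpers let generic /proc/stat code fold architecture-private counters (NMIs, APIC timer ticks, IPIs, spurious and error counts) into its aggregate interrupt total. A minimal userspace sketch of that aggregation, with made-up numbers and a two-element array standing in for the per-CPU PDA:

#include <stdio.h>
#include <stdint.h>

/* Toy stand-in for the per-CPU counters; field names mirror the hunk above. */
struct cpu_irq_stat {
    uint64_t nmi_count, apic_timer_irqs, irq_resched_count,
             irq_call_count, irq_tlb_count, irq_thermal_count,
             irq_threshold_count, irq_spurious_count;
};

static struct cpu_irq_stat stat[2] = {
    { 3, 1000, 40, 12, 7, 0, 0, 1 },
    { 2,  998, 35, 11, 9, 0, 0, 0 },
};
static uint64_t irq_err_count = 5;

/* Mirrors arch_irq_stat_cpu(): everything that is not an ordinary device
 * interrupt but should still be visible in the aggregate count. */
static uint64_t arch_irq_stat_cpu(unsigned int cpu)
{
    const struct cpu_irq_stat *s = &stat[cpu];

    return s->nmi_count + s->apic_timer_irqs + s->irq_resched_count +
           s->irq_call_count + s->irq_tlb_count + s->irq_thermal_count +
           s->irq_threshold_count + s->irq_spurious_count;
}

int main(void)
{
    unsigned int cpu;
    uint64_t sum = irq_err_count;        /* the arch_irq_stat() part */

    for (cpu = 0; cpu < 2; cpu++)
        sum += arch_irq_stat_cpu(cpu);
    printf("arch contribution to /proc/stat \"intr\": %llu\n",
           (unsigned long long)sum);
    return 0;
}
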
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..9200a1e2752d
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,163 @@
1#include <linux/errno.h>
2#include <linux/signal.h>
3#include <linux/sched.h>
4#include <linux/ioport.h>
5#include <linux/interrupt.h>
6#include <linux/slab.h>
7#include <linux/random.h>
8#include <linux/init.h>
9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h>
11#include <linux/bitops.h>
12
13#include <asm/atomic.h>
14#include <asm/system.h>
15#include <asm/io.h>
16#include <asm/timer.h>
17#include <asm/pgtable.h>
18#include <asm/delay.h>
19#include <asm/desc.h>
20#include <asm/apic.h>
21#include <asm/arch_hooks.h>
22#include <asm/i8259.h>
23
24
25
26/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly
29 * (ie as explained in the intel literature). On a 386, you
30 * can't use exception 16 due to bad IBM design, so we have to
31 * rely on the less exact irq13.
32 *
33 * Careful.. Not only is IRQ13 unreliable, but it also
34 * leads to races. IBM designers who came up with it should
35 * be shot.
36 */
37
38
39static irqreturn_t math_error_irq(int cpl, void *dev_id)
40{
41 extern void math_error(void __user *);
42 outb(0,0xF0);
43 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
44 return IRQ_NONE;
45 math_error((void __user *)get_irq_regs()->ip);
46 return IRQ_HANDLED;
47}
48
49/*
50 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
51 * so allow interrupt sharing.
52 */
53static struct irqaction fpu_irq = {
54 .handler = math_error_irq,
55 .mask = CPU_MASK_NONE,
56 .name = "fpu",
57};
58
59void __init init_ISA_irqs (void)
60{
61 int i;
62
63#ifdef CONFIG_X86_LOCAL_APIC
64 init_bsp_APIC();
65#endif
66 init_8259A(0);
67
68 /*
69 * 16 old-style INTA-cycle interrupts:
70 */
71 for (i = 0; i < 16; i++) {
72 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT");
74 }
75}
76
77/*
78 * IRQ2 is cascade interrupt to second interrupt controller
79 */
80static struct irqaction irq2 = {
81 .handler = no_action,
82 .mask = CPU_MASK_NONE,
83 .name = "cascade",
84};
85
86/* Overridden in paravirt.c */
87void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
88
89void __init native_init_IRQ(void)
90{
91 int i;
92
93 /* all the set up before the call gates are initialised */
94 pre_intr_init_hook();
95
96 /*
97 * Cover the whole vector space, no vector can escape
98 * us. (some of these will be overridden and become
99 * 'special' SMP interrupts)
100 */
101 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
102 int vector = FIRST_EXTERNAL_VECTOR + i;
103 if (i >= NR_IRQS)
104 break;
105 /* SYSCALL_VECTOR was reserved in trap_init. */
106 if (!test_bit(vector, used_vectors))
107 set_intr_gate(vector, interrupt[i]);
108 }
109
110#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
111 /*
112 * IRQ0 must be given a fixed assignment and initialized,
113 * because it's used before the IO-APIC is set up.
114 */
115 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
116
117 /*
118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
119 * IPI, driven by wakeup.
120 */
121 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
122
123 /* IPI for invalidation */
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for single call function */
130 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
131#endif
132
133#ifdef CONFIG_X86_LOCAL_APIC
134 /* self generated IPI for local APIC timer */
135 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
136
137 /* IPI vectors for APIC spurious and error interrupts */
138 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
139 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
140#endif
141
142#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
143 /* thermal monitor LVT interrupt */
144 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
145#endif
146
147 if (!acpi_ioapic)
148 setup_irq(2, &irq2);
149
150 /* setup after call gates are initialised (usually add in
151 * the architecture specific gates)
152 */
153 intr_init_hook();
154
155 /*
156 * External FPU? Set up irq13 if so, for
157 * original braindamaged IBM FERR coupling.
158 */
159 if (boot_cpu_data.hard_math && !cpu_has_fpu)
160 setup_irq(FPU_IRQ, &fpu_irq);
161
162 irq_ctx_init(smp_processor_id());
163}
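
In native_init_IRQ() above, every external vector receives an interrupt gate unless it was already reserved in used_vectors; the usual reserved case on 32-bit is the syscall vector set up in trap_init(). A toy sketch of that loop, with the vector constants assumed rather than quoted from the headers:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20   /* assumed */
#define NR_VECTORS            256
#define SYSCALL_VECTOR        0x80   /* assumed int 0x80 gate */

int main(void)
{
    int installed = 0, vector;

    for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
        if (vector == SYSCALL_VECTOR)   /* reserved in trap_init() */
            continue;
        installed++;                    /* stands in for set_intr_gate() */
    }
    printf("gates installed: %d of %d external vectors\n",
           installed, NR_VECTORS - FIRST_EXTERNAL_VECTOR);
    return 0;
}
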
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..5b5be9d43c2a
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,233 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f)
64 */
65
66/*
67 * The IO-APIC gives us many more interrupt sources. Most of these
68 * are unused but an SMP system is supposed to have enough memory ...
69 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
70 * across the spectrum, so we really want to be prepared to get all
71 * of these. Plus, more powerful systems might have more than 64
72 * IO-APIC registers.
73 *
74 * (these are usually mapped into the 0x30-0xff vector range)
75 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107
108/*
109 * IRQ2 is cascade interrupt to second interrupt controller
110 */
111
112static struct irqaction irq2 = {
113 .handler = no_action,
114 .mask = CPU_MASK_NONE,
115 .name = "cascade",
116};
117DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
118 [0 ... IRQ0_VECTOR - 1] = -1,
119 [IRQ0_VECTOR] = 0,
120 [IRQ1_VECTOR] = 1,
121 [IRQ2_VECTOR] = 2,
122 [IRQ3_VECTOR] = 3,
123 [IRQ4_VECTOR] = 4,
124 [IRQ5_VECTOR] = 5,
125 [IRQ6_VECTOR] = 6,
126 [IRQ7_VECTOR] = 7,
127 [IRQ8_VECTOR] = 8,
128 [IRQ9_VECTOR] = 9,
129 [IRQ10_VECTOR] = 10,
130 [IRQ11_VECTOR] = 11,
131 [IRQ12_VECTOR] = 12,
132 [IRQ13_VECTOR] = 13,
133 [IRQ14_VECTOR] = 14,
134 [IRQ15_VECTOR] = 15,
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136};
137
138void __init init_ISA_irqs(void)
139{
140 int i;
141
142 init_bsp_APIC();
143 init_8259A(0);
144
145 for (i = 0; i < NR_IRQS; i++) {
146 irq_desc[i].status = IRQ_DISABLED;
147 irq_desc[i].action = NULL;
148 irq_desc[i].depth = 1;
149
150 if (i < 16) {
151 /*
152 * 16 old-style INTA-cycle interrupts:
153 */
154 set_irq_chip_and_handler_name(i, &i8259A_chip,
155 handle_level_irq, "XT");
156 } else {
157 /*
158 * 'high' PCI IRQs filled in on demand
159 */
160 irq_desc[i].chip = &no_irq_chip;
161 }
162 }
163}
164
165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
166
167static void __init smp_intr_init(void)
168{
169#ifdef CONFIG_SMP
170 /*
171 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
172 * IPI, driven by wakeup.
173 */
174 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
175
176 /* IPIs for invalidation */
177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
178 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
179 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
180 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
181 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
182 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
183 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
184 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
185
186 /* IPI for generic function call */
187 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
188
189 /* IPI for generic single function call */
190 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
191 call_function_single_interrupt);
192
193 /* Low priority IPI to cleanup after moving an irq */
194 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
195#endif
196}
197
198static void __init apic_intr_init(void)
199{
200 smp_intr_init();
201
202 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
203 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
204
205 /* self generated IPI for local APIC timer */
206 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
207
208 /* IPI vectors for APIC spurious and error interrupts */
209 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
210 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
211}
212
213void __init native_init_IRQ(void)
214{
215 int i;
216
217 init_ISA_irqs();
218 /*
219 * Cover the whole vector space, no vector can escape
220 * us. (some of these will be overridden and become
221 * 'special' SMP interrupts)
222 */
223 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
224 int vector = FIRST_EXTERNAL_VECTOR + i;
225 if (vector != IA32_SYSCALL_VECTOR)
226 set_intr_gate(vector, interrupt[i]);
227 }
228
229 apic_intr_init();
230
231 if (!acpi_ioapic)
232 setup_irq(2, &irq2);
233}
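
The BUILD_IRQ() stubs above push the one's complement of the vector number ("push $~(nr)") before jumping to common_interrupt, which makes orig_ax negative (the property the ret_from_ entry code keys on, per the comment in irq_32.c); do_IRQ() then undoes it with another complement, and on 64-bit the recovered vector is translated to an IRQ through the per-CPU vector_irq[] table defined in this file. A quick demonstration of that encode/decode round trip:

#include <stdio.h>

int main(void)
{
    long orig_ax;
    int nr, recovered;

    for (nr = 0x20; nr <= 0x22; nr++) {
        orig_ax = ~nr;               /* what the per-vector stub pushes */
        recovered = ~orig_ax;        /* what the entry path computes */
        printf("vector 0x%02x -> orig_ax %ld -> recovered 0x%02x\n",
               nr, orig_ax, recovered);
    }
    return 0;
}
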
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
135 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
136 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
137 if (!data) { 141 if (!data) {
142 kfree(node);
138 error = -ENXIO; 143 error = -ENXIO;
139 goto err_dir; 144 goto err_dir;
140 } 145 }
@@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void)
209{ 214{
210 int error = 0; 215 int error = 0;
211 216
217 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
218 if (!arch_debugfs_dir)
219 return -ENOMEM;
220
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 221#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 222 error = boot_params_kdebugfs_init();
214#endif 223#endif
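
kdebugfs.c now creates a shared /sys/kernel/debug/x86 directory and exports arch_debugfs_dir, so other x86 code can place its debug files there instead of creating private top-level directories. A hedged sketch of a consumer (the file name and module wrapper are invented; debugfs_create_u32() is the stock debugfs helper):

/* Illustrative module, not part of this patch. */
#include <linux/debugfs.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>

/* Exported by the hunk above; its header declaration is assumed here. */
extern struct dentry *arch_debugfs_dir;

static u32 example_value = 42;
static struct dentry *example_file;

static int __init example_init(void)
{
    example_file = debugfs_create_u32("example_value", 0444,
                                      arch_debugfs_dir, &example_value);
    return example_file ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
    debugfs_remove(example_file);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
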
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
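A note on the kgdb hunks above: gdb's x86-64 remote register file stores eflags, cs and ss as 32-bit slots after the 64-bit registers (rax..r15, rip), which is why the handlers alias the unsigned long buffer through a u32 pointer on 64-bit builds. A minimal sketch of that aliasing, with placeholder index values (the real constants come from asm/kgdb.h):

#include <stdint.h>

/* Hypothetical indices, for illustration only. */
enum { GDB_PS = 34, GDB_CS = 35, GDB_SS = 36 };

static void pack_32bit_tail(unsigned long *gdb_regs,
			    unsigned long flags, unsigned long cs,
			    unsigned long ss)
{
	/* Same buffer, viewed as an array of 32-bit slots. */
	uint32_t *gdb_regs32 = (uint32_t *)gdb_regs;

	gdb_regs32[GDB_PS] = (uint32_t)flags;
	gdb_regs32[GDB_CS] = (uint32_t)cs;
	gdb_regs32[GDB_SS] = (uint32_t)ss;
}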
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
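On the kprobes hunks above: the single global kretprobe_lock is replaced by kretprobe_hash_lock()/kretprobe_hash_unlock(), which select a per-bucket lock (and the matching instance list head) by hashing the task pointer. A rough sketch of that bucketed pattern, with an illustrative table size and hash (the kernel uses hash_ptr() and its own bucket count):

#include <stdint.h>

#define INST_TABLE_BITS	6
#define INST_TABLE_SIZE	(1u << INST_TABLE_BITS)

struct inst_bucket {
	int lock;	/* stands in for a spinlock_t */
	void *head;	/* stands in for the hlist of kretprobe instances */
};

static struct inst_bucket inst_table[INST_TABLE_SIZE];

/* Illustrative hash; two tasks in different buckets no longer contend
 * on one global lock. */
static struct inst_bucket *task_bucket(const void *task)
{
	return &inst_table[((uintptr_t)task >> 4) & (INST_TABLE_SIZE - 1)];
}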
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..d02def06ca91 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c73..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,11 +18,12 @@
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/syscalls.h>
21 22
22#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
23static void flush_ldt(void *null) 24static void flush_ldt(void *current_mm)
24{ 25{
25 if (current->active_mm) 26 if (current->active_mm == current_mm)
26 load_LDT(&current->active_mm->context); 27 load_LDT(&current->active_mm->context);
27} 28}
28#endif 29#endif
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 54
55 paravirt_alloc_ldt(newldt, mincount);
56
54#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
56 wmb(); 59 wmb();
@@ -62,19 +65,18 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 65
63 if (reload) { 66 if (reload) {
64#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
65 cpumask_t mask;
66
67 preempt_disable(); 68 preempt_disable();
68 load_LDT(pc); 69 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 70 if (!cpus_equal(current->mm->cpu_vm_mask,
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 71 cpumask_of_cpu(smp_processor_id())))
71 smp_call_function(flush_ldt, NULL, 1, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 73 preempt_enable();
73#else 74#else
74 load_LDT(pc); 75 load_LDT(pc);
75#endif 76#endif
76 } 77 }
77 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
79 vfree(oldldt); 81 vfree(oldldt);
80 else 82 else
@@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
86static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
87{ 89{
88 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
89 92
90 if (err < 0) 93 if (err < 0)
91 return err; 94 return err;
92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
93 return 0; 98 return 0;
94} 99}
95 100
@@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
126 if (mm == current->active_mm) 131 if (mm == current->active_mm)
127 clear_LDT(); 132 clear_LDT();
128#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
130 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
131 else 137 else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,9 @@
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15#include <linux/suspend.h>
16
14#include <asm/pgtable.h> 17#include <asm/pgtable.h>
15#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
16#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -20,6 +23,7 @@
20#include <asm/cpufeature.h> 23#include <asm/cpufeature.h>
21#include <asm/desc.h> 24#include <asm/desc.h>
22#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/cacheflush.h>
23 27
24#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
25static u32 kexec_pgd[1024] PAGE_ALIGNED; 29static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -39,7 +43,7 @@ static void set_idt(void *newidt, __u16 limit)
39 curidt.address = (unsigned long)newidt; 43 curidt.address = (unsigned long)newidt;
40 44
41 load_idt(&curidt); 45 load_idt(&curidt);
42}; 46}
43 47
44 48
45static void set_gdt(void *newgdt, __u16 limit) 49static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +55,7 @@ static void set_gdt(void *newgdt, __u16 limit)
51 curgdt.address = (unsigned long)newgdt; 55 curgdt.address = (unsigned long)newgdt;
52 56
53 load_gdt(&curgdt); 57 load_gdt(&curgdt);
54}; 58}
55 59
56static void load_segments(void) 60static void load_segments(void)
57{ 61{
@@ -75,7 +79,7 @@ static void load_segments(void)
75/* 79/*
76 * A architecture hook called to validate the 80 * A architecture hook called to validate the
77 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
78 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
79 * have been allocated, but the segments have yet 83 * have been allocated, but the segments have yet
80 * been copied into the kernel. 84 * been copied into the kernel.
81 * 85 *
@@ -83,10 +87,12 @@ static void load_segments(void)
83 * reboot code buffer to allow us to avoid allocations 87 * reboot code buffer to allow us to avoid allocations
84 * later. 88 * later.
85 * 89 *
86 * Currently nothing. 90 * Make control page executable.
87 */ 91 */
88int machine_kexec_prepare(struct kimage *image) 92int machine_kexec_prepare(struct kimage *image)
89{ 93{
94 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1);
90 return 0; 96 return 0;
91} 97}
92 98
@@ -96,25 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
96 */ 102 */
97void machine_kexec_cleanup(struct kimage *image) 103void machine_kexec_cleanup(struct kimage *image)
98{ 104{
105 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1);
99} 107}
100 108
101/* 109/*
102 * Do not allocate memory (or fail in any way) in machine_kexec(). 110 * Do not allocate memory (or fail in any way) in machine_kexec().
103 * We are past the point of no return, committed to rebooting now. 111 * We are past the point of no return, committed to rebooting now.
104 */ 112 */
105NORET_TYPE void machine_kexec(struct kimage *image) 113void machine_kexec(struct kimage *image)
106{ 114{
107 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
108 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
118 asmlinkage unsigned long
119 (*relocate_kernel_ptr)(unsigned long indirection_page,
120 unsigned long control_page,
121 unsigned long start_address,
122 unsigned int has_pae,
123 unsigned int preserve_context);
124
125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
129
130 save_ftrace_enabled = __ftrace_enabled_save();
109 131
110 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
111 local_irq_disable(); 133 local_irq_disable();
112 134
135 if (image->preserve_context) {
136#ifdef CONFIG_X86_IO_APIC
137 /* We need to put APICs in legacy mode so that we can
138 * get timer interrupts in second kernel. kexec/kdump
139 * paths already have calls to disable_IO_APIC() in
140 * one form or other. kexec jump path also need
141 * one.
142 */
143 disable_IO_APIC();
144#endif
145 }
146
113 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
114 memcpy(control_page, relocate_kernel, PAGE_SIZE); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
115 149
150 relocate_kernel_ptr = control_page;
116 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
117 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
118 page_list[PA_PGD] = __pa(kexec_pgd); 153 page_list[PA_PGD] = __pa(kexec_pgd);
119 page_list[VA_PGD] = (unsigned long)kexec_pgd; 154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
120#ifdef CONFIG_X86_PAE 155#ifdef CONFIG_X86_PAE
@@ -127,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
127 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
128 page_list[PA_PTE_1] = __pa(kexec_pte1); 163 page_list[PA_PTE_1] = __pa(kexec_pte1);
129 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
130 166
131 /* The segment registers are funny things, they have both a 167 /* The segment registers are funny things, they have both a
132 * visible and an invisible part. Whenever the visible part is 168 * visible and an invisible part. Whenever the visible part is
@@ -145,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
145 set_idt(phys_to_virt(0),0); 181 set_idt(phys_to_virt(0),0);
146 182
147 /* now call it */ 183 /* now call it */
148 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 184 image->start = relocate_kernel_ptr((unsigned long)image->head,
149 image->start, cpu_has_pae); 185 (unsigned long)page_list,
186 image->start, cpu_has_pae,
187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
150} 195}
151 196
152void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
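The machine_kexec_32.c change above copies relocate_kernel into the control page and then calls it *through that copy*, so the jump does not depend on the kernel image it is about to overwrite; for kexec jump the return value is stored back into image->start. A sketch of the call-through-the-copy pattern, mirroring the relocate_kernel_ptr prototype in the patch (argument values here are generic, not the exact ones the kernel passes):

typedef unsigned long (*relocate_fn)(unsigned long indirection_page,
				     unsigned long control_page,
				     unsigned long start_address,
				     unsigned int has_pae,
				     unsigned int preserve_context);

static unsigned long call_relocated(void *control_page,
				    unsigned long head,
				    unsigned long page_list,
				    unsigned long start,
				    unsigned int has_pae,
				    unsigned int preserve_context)
{
	/* relocate_kernel has already been memcpy()'d into control_page,
	 * so execution continues from the copy. */
	relocate_fn jump = (relocate_fn)control_page;

	return jump(head, page_list, start, has_pae, preserve_context);
}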
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db4511..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
16#include <asm/mmu_context.h> 18#include <asm/mmu_context.h>
@@ -110,7 +112,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
110{ 112{
111 pgd_t *level4p; 113 pgd_t *level4p;
112 level4p = (pgd_t *)__va(start_pgtable); 114 level4p = (pgd_t *)__va(start_pgtable);
113 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); 115 return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
114} 116}
115 117
116static void set_idt(void *newidt, u16 limit) 118static void set_idt(void *newidt, u16 limit)
@@ -179,11 +181,13 @@ void machine_kexec_cleanup(struct kimage *image)
179 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
180 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
181 */ 183 */
182NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
183{ 185{
184 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
185 void *control_page; 187 void *control_page;
186 188
189 tracer_disable();
190
187 /* Interrupts aren't acceptable while we reboot */ 191 /* Interrupts aren't acceptable while we reboot */
188 local_irq_disable(); 192 local_irq_disable();
189 193
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
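The IRQ-selection rework in mfgpt_32.c above comes down to picking a 4-bit field of MSR_PIC_ZSEL_LOW per timer/comparator pair and refusing to touch it when VSA already routed it to IRQ 2. A small sketch of the shift arithmetic, mirroring the computation in the patch:

static unsigned int zsel_shift(int timer, int is_cmp2)
{
	/* CMP1 of timers 0-3 lives in nibbles 0-3 of MSR_PIC_ZSEL_LOW,
	 * CMP2 in nibbles 4-7; each nibble holds that output's IRQ. */
	return (unsigned int)(((is_cmp2 ? 4 : 0) + timer % 4) * 4);
}

/* Example: timer 1, CMP2 -> nibble 5 -> shift 20; the routed IRQ is
 * (zsel >> 20) & 0xF, and a value of 2 there means VSA owns the line,
 * so geode_mfgpt_set_irq() bails out with -EIO. */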
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 69729e38b78a..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,848 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
12 * Order Number 245472 or free download from:
13 *
14 * http://developer.intel.com/design/pentium4/manuals/245472.htm
15 *
16 * For more information, go to http://www.urbanmyth.org/microcode
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 *
23 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
24 * Initial release.
25 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
26 * Added read() support + cleanups.
27 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
28 * Added 'device trimming' support. open(O_WRONLY) zeroes
29 * and frees the saved copy of applied microcode.
30 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
31 * Made to use devfs (/dev/cpu/microcode) + cleanups.
32 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
33 * Added misc device support (now uses both devfs and misc).
34 * Added MICROCODE_IOCFREE ioctl to clear memory.
35 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
36 * Messages for error cases (non Intel & no suitable microcode).
37 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
38 * Removed ->release(). Removed exclusive open and status bitmap.
39 * Added microcode_rwsem to serialize read()/write()/ioctl().
40 * Removed global kernel lock usage.
41 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
42 * Write 0 to 0x8B msr and then cpuid before reading revision,
43 * so that it works even if there were no update done by the
44 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
45 * to be 0 on my machine which is why it worked even when I
46 * disabled update by the BIOS)
47 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
48 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
49 * Tigran Aivazian <tigran@veritas.com>
50 * Intel Pentium 4 processor support and bugfixes.
51 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
52 * Bugfix for HT (Hyper-Threading) enabled processors
53 * whereby processor resources are shared by all logical processors
54 * in a single CPU package.
55 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
56 * Tigran Aivazian <tigran@veritas.com>,
57 * Serialize updates as required on HT processors due to speculative
58 * nature of implementation.
59 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
60 * Fix the panic when writing zero-length microcode chunk.
61 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
62 * Jun Nakajima <jun.nakajima@intel.com>
63 * Support for the microcode updates in the new format.
64 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
65 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
66 * because we no longer hold a copy of applied microcode
67 * in kernel memory.
68 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
69 * Fix sigmatch() macro to handle old CPUs with pf == 0.
70 * Thanks to Stuart Swales for pointing out this bug.
71 */
72
73//#define DEBUG /* pr_debug */
74#include <linux/capability.h>
75#include <linux/kernel.h>
76#include <linux/init.h>
77#include <linux/sched.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94
95MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
96MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
97MODULE_LICENSE("GPL");
98
99#define MICROCODE_VERSION "1.14a"
100
101#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
102#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
103#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
104#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
105#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
106#define DWSIZE (sizeof (u32))
107#define get_totalsize(mc) \
108 (((microcode_t *)mc)->hdr.totalsize ? \
109 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
110#define get_datasize(mc) \
111 (((microcode_t *)mc)->hdr.datasize ? \
112 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
113
114#define sigmatch(s1, s2, p1, p2) \
115 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
116
117#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
118
119/* serialize access to the physical write to MSR 0x79 */
120static DEFINE_SPINLOCK(microcode_update_lock);
121
122/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
123static DEFINE_MUTEX(microcode_mutex);
124
125static struct ucode_cpu_info {
126 int valid;
127 unsigned int sig;
128 unsigned int pf;
129 unsigned int rev;
130 microcode_t *mc;
131} ucode_cpu_info[NR_CPUS];
132
133static void collect_cpu_info(int cpu_num)
134{
135 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
137 unsigned int val[2];
138
139 /* We should bind the task to the CPU */
140 BUG_ON(raw_smp_processor_id() != cpu_num);
141 uci->pf = uci->rev = 0;
142 uci->mc = NULL;
143 uci->valid = 1;
144
145 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
146 cpu_has(c, X86_FEATURE_IA64)) {
147 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
148 "processor\n", cpu_num);
149 uci->valid = 0;
150 return;
151 }
152
153 uci->sig = cpuid_eax(0x00000001);
154
155 if ((c->x86_model >= 5) || (c->x86 > 6)) {
156 /* get processor flags from MSR 0x17 */
157 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
158 uci->pf = 1 << ((val[1] >> 18) & 7);
159 }
160
161 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
162 /* see notes above for revision 1.07. Apparent chip bug */
163 sync_core();
164 /* get the current revision from MSR 0x8B */
165 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
166 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
167 uci->sig, uci->pf, uci->rev);
168}
169
170static inline int microcode_update_match(int cpu_num,
171 microcode_header_t *mc_header, int sig, int pf)
172{
173 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
174
175 if (!sigmatch(sig, uci->sig, pf, uci->pf)
176 || mc_header->rev <= uci->rev)
177 return 0;
178 return 1;
179}
180
181static int microcode_sanity_check(void *mc)
182{
183 microcode_header_t *mc_header = mc;
184 struct extended_sigtable *ext_header = NULL;
185 struct extended_signature *ext_sig;
186 unsigned long total_size, data_size, ext_table_size;
187 int sum, orig_sum, ext_sigcount = 0, i;
188
189 total_size = get_totalsize(mc_header);
190 data_size = get_datasize(mc_header);
191 if (data_size + MC_HEADER_SIZE > total_size) {
192 printk(KERN_ERR "microcode: error! "
193 "Bad data size in microcode data file\n");
194 return -EINVAL;
195 }
196
197 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
198 printk(KERN_ERR "microcode: error! "
199 "Unknown microcode update format\n");
200 return -EINVAL;
201 }
202 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
203 if (ext_table_size) {
204 if ((ext_table_size < EXT_HEADER_SIZE)
205 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
206 printk(KERN_ERR "microcode: error! "
207 "Small exttable size in microcode data file\n");
208 return -EINVAL;
209 }
210 ext_header = mc + MC_HEADER_SIZE + data_size;
211 if (ext_table_size != exttable_size(ext_header)) {
212 printk(KERN_ERR "microcode: error! "
213 "Bad exttable size in microcode data file\n");
214 return -EFAULT;
215 }
216 ext_sigcount = ext_header->count;
217 }
218
219 /* check extended table checksum */
220 if (ext_table_size) {
221 int ext_table_sum = 0;
222 int *ext_tablep = (int *)ext_header;
223
224 i = ext_table_size / DWSIZE;
225 while (i--)
226 ext_table_sum += ext_tablep[i];
227 if (ext_table_sum) {
228 printk(KERN_WARNING "microcode: aborting, "
229 "bad extended signature table checksum\n");
230 return -EINVAL;
231 }
232 }
233
234 /* calculate the checksum */
235 orig_sum = 0;
236 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
237 while (i--)
238 orig_sum += ((int *)mc)[i];
239 if (orig_sum) {
240 printk(KERN_ERR "microcode: aborting, bad checksum\n");
241 return -EINVAL;
242 }
243 if (!ext_table_size)
244 return 0;
245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
248 EXT_SIGNATURE_SIZE * i;
249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
252 if (sum) {
253 printk(KERN_ERR "microcode: aborting, bad checksum\n");
254 return -EINVAL;
255 }
256 }
257 return 0;
258}
259
260/*
261 * return 0 - no update found
262 * return 1 - found update
263 * return < 0 - error
264 */
265static int get_maching_microcode(void *mc, int cpu)
266{
267 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
268 microcode_header_t *mc_header = mc;
269 struct extended_sigtable *ext_header;
270 unsigned long total_size = get_totalsize(mc_header);
271 int ext_sigcount, i;
272 struct extended_signature *ext_sig;
273 void *new_mc;
274
275 if (microcode_update_match(cpu, mc_header,
276 mc_header->sig, mc_header->pf))
277 goto find;
278
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0;
281
282 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
283 ext_sigcount = ext_header->count;
284 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
285 for (i = 0; i < ext_sigcount; i++) {
286 if (microcode_update_match(cpu, mc_header,
287 ext_sig->sig, ext_sig->pf))
288 goto find;
289 ext_sig++;
290 }
291 return 0;
292find:
293 pr_debug("microcode: CPU%d found a matching microcode update with"
294 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
295 new_mc = vmalloc(total_size);
296 if (!new_mc) {
297 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
298 return -ENOMEM;
299 }
300
301 /* free previous update file */
302 vfree(uci->mc);
303
304 memcpy(new_mc, mc, total_size);
305 uci->mc = new_mc;
306 return 1;
307}
308
309static void apply_microcode(int cpu)
310{
311 unsigned long flags;
312 unsigned int val[2];
313 int cpu_num = raw_smp_processor_id();
314 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
315
316 /* We should bind the task to the CPU */
317 BUG_ON(cpu_num != cpu);
318
319 if (uci->mc == NULL)
320 return;
321
322 /* serialize access to the physical write to MSR 0x79 */
323 spin_lock_irqsave(&microcode_update_lock, flags);
324
325 /* write microcode via MSR 0x79 */
326 wrmsr(MSR_IA32_UCODE_WRITE,
327 (unsigned long) uci->mc->bits,
328 (unsigned long) uci->mc->bits >> 16 >> 16);
329 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
330
331 /* see notes above for revision 1.07. Apparent chip bug */
332 sync_core();
333
334 /* get the current revision from MSR 0x8B */
335 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
336
337 spin_unlock_irqrestore(&microcode_update_lock, flags);
338 if (val[1] != uci->mc->hdr.rev) {
339 printk(KERN_ERR "microcode: CPU%d update from revision "
340 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
341 return;
342 }
343 printk(KERN_INFO "microcode: CPU%d updated from revision "
344 "0x%x to 0x%x, date = %08x \n",
345 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
346 uci->rev = val[1];
347}
348
349#ifdef CONFIG_MICROCODE_OLD_INTERFACE
350static void __user *user_buffer; /* user area microcode data buffer */
351static unsigned int user_buffer_size; /* it's size */
352
353static long get_next_ucode(void **mc, long offset)
354{
355 microcode_header_t mc_header;
356 unsigned long total_size;
357
358 /* No more data */
359 if (offset >= user_buffer_size)
360 return 0;
361 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
362 printk(KERN_ERR "microcode: error! Can not read user data\n");
363 return -EFAULT;
364 }
365 total_size = get_totalsize(&mc_header);
366 if (offset + total_size > user_buffer_size) {
367 printk(KERN_ERR "microcode: error! Bad total size in microcode "
368 "data file\n");
369 return -EINVAL;
370 }
371 *mc = vmalloc(total_size);
372 if (!*mc)
373 return -ENOMEM;
374 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
375 printk(KERN_ERR "microcode: error! Can not read user data\n");
376 vfree(*mc);
377 return -EFAULT;
378 }
379 return offset + total_size;
380}
381
382static int do_microcode_update (void)
383{
384 long cursor = 0;
385 int error = 0;
386 void *new_mc = NULL;
387 int cpu;
388 cpumask_t old;
389
390 old = current->cpus_allowed;
391
392 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
393 error = microcode_sanity_check(new_mc);
394 if (error)
395 goto out;
396 /*
397 * It's possible the data file has multiple matching ucode,
398 * lets keep searching till the latest version
399 */
400 for_each_online_cpu(cpu) {
401 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
402
403 if (!uci->valid)
404 continue;
405 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
406 error = get_maching_microcode(new_mc, cpu);
407 if (error < 0)
408 goto out;
409 if (error == 1)
410 apply_microcode(cpu);
411 }
412 vfree(new_mc);
413 }
414out:
415 if (cursor > 0)
416 vfree(new_mc);
417 if (cursor < 0)
418 error = cursor;
419 set_cpus_allowed_ptr(current, &old);
420 return error;
421}
422
423static int microcode_open (struct inode *unused1, struct file *unused2)
424{
425 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
426}
427
428static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
429{
430 ssize_t ret;
431
432 if ((len >> PAGE_SHIFT) > num_physpages) {
433 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
434 return -EINVAL;
435 }
436
437 get_online_cpus();
438 mutex_lock(&microcode_mutex);
439
440 user_buffer = (void __user *) buf;
441 user_buffer_size = (int) len;
442
443 ret = do_microcode_update();
444 if (!ret)
445 ret = (ssize_t)len;
446
447 mutex_unlock(&microcode_mutex);
448 put_online_cpus();
449
450 return ret;
451}
452
453static const struct file_operations microcode_fops = {
454 .owner = THIS_MODULE,
455 .write = microcode_write,
456 .open = microcode_open,
457};
458
459static struct miscdevice microcode_dev = {
460 .minor = MICROCODE_MINOR,
461 .name = "microcode",
462 .fops = &microcode_fops,
463};
464
465static int __init microcode_dev_init (void)
466{
467 int error;
468
469 error = misc_register(&microcode_dev);
470 if (error) {
471 printk(KERN_ERR
472 "microcode: can't misc_register on minor=%d\n",
473 MICROCODE_MINOR);
474 return error;
475 }
476
477 return 0;
478}
479
480static void microcode_dev_exit (void)
481{
482 misc_deregister(&microcode_dev);
483}
484
485MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
486#else
487#define microcode_dev_init() 0
488#define microcode_dev_exit() do { } while(0)
489#endif
490
491static long get_next_ucode_from_buffer(void **mc, void *buf,
492 unsigned long size, long offset)
493{
494 microcode_header_t *mc_header;
495 unsigned long total_size;
496
497 /* No more data */
498 if (offset >= size)
499 return 0;
500 mc_header = (microcode_header_t *)(buf + offset);
501 total_size = get_totalsize(mc_header);
502
503 if (offset + total_size > size) {
504 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
505 return -EINVAL;
506 }
507
508 *mc = vmalloc(total_size);
509 if (!*mc) {
510 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
511 return -ENOMEM;
512 }
513 memcpy(*mc, buf + offset, total_size);
514 return offset + total_size;
515}
516
517/* fake device for request_firmware */
518static struct platform_device *microcode_pdev;
519
520static int cpu_request_microcode(int cpu)
521{
522 char name[30];
523 struct cpuinfo_x86 *c = &cpu_data(cpu);
524 const struct firmware *firmware;
525 void *buf;
526 unsigned long size;
527 long offset = 0;
528 int error;
529 void *mc;
530
531 /* We should bind the task to the CPU */
532 BUG_ON(cpu != raw_smp_processor_id());
533 sprintf(name,"intel-ucode/%02x-%02x-%02x",
534 c->x86, c->x86_model, c->x86_mask);
535 error = request_firmware(&firmware, name, &microcode_pdev->dev);
536 if (error) {
537 pr_debug("microcode: ucode data file %s load failed\n", name);
538 return error;
539 }
540 buf = firmware->data;
541 size = firmware->size;
542 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
543 > 0) {
544 error = microcode_sanity_check(mc);
545 if (error)
546 break;
547 error = get_maching_microcode(mc, cpu);
548 if (error < 0)
549 break;
550 /*
551 * It's possible the data file has multiple matching ucode,
552 * lets keep searching till the latest version
553 */
554 if (error == 1) {
555 apply_microcode(cpu);
556 error = 0;
557 }
558 vfree(mc);
559 }
560 if (offset > 0)
561 vfree(mc);
562 if (offset < 0)
563 error = offset;
564 release_firmware(firmware);
565
566 return error;
567}
568
569static int apply_microcode_check_cpu(int cpu)
570{
571 struct cpuinfo_x86 *c = &cpu_data(cpu);
572 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
573 cpumask_t old;
574 unsigned int val[2];
575 int err = 0;
576
577 /* Check if the microcode is available */
578 if (!uci->mc)
579 return 0;
580
581 old = current->cpus_allowed;
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
583
584 /* Check if the microcode we have in memory matches the CPU */
585 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
586 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
587 err = -EINVAL;
588
589 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
590 /* get processor flags from MSR 0x17 */
591 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
592 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
593 err = -EINVAL;
594 }
595
596 if (!err) {
597 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
598 /* see notes above for revision 1.07. Apparent chip bug */
599 sync_core();
600 /* get the current revision from MSR 0x8B */
601 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
602 if (uci->rev != val[1])
603 err = -EINVAL;
604 }
605
606 if (!err)
607 apply_microcode(cpu);
608 else
609 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
610 " sig=0x%x, pf=0x%x, rev=0x%x\n",
611 cpu, uci->sig, uci->pf, uci->rev);
612
613 set_cpus_allowed_ptr(current, &old);
614 return err;
615}
616
617static void microcode_init_cpu(int cpu, int resume)
618{
619 cpumask_t old;
620 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
621
622 old = current->cpus_allowed;
623
624 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
625 mutex_lock(&microcode_mutex);
626 collect_cpu_info(cpu);
627 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
628 cpu_request_microcode(cpu);
629 mutex_unlock(&microcode_mutex);
630 set_cpus_allowed_ptr(current, &old);
631}
632
633static void microcode_fini_cpu(int cpu)
634{
635 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
636
637 mutex_lock(&microcode_mutex);
638 uci->valid = 0;
639 vfree(uci->mc);
640 uci->mc = NULL;
641 mutex_unlock(&microcode_mutex);
642}
643
644static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
645{
646 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
647 char *end;
648 unsigned long val = simple_strtoul(buf, &end, 0);
649 int err = 0;
650 int cpu = dev->id;
651
652 if (end == buf)
653 return -EINVAL;
654 if (val == 1) {
655 cpumask_t old;
656
657 old = current->cpus_allowed;
658
659 get_online_cpus();
660 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
661
662 mutex_lock(&microcode_mutex);
663 if (uci->valid)
664 err = cpu_request_microcode(cpu);
665 mutex_unlock(&microcode_mutex);
666 put_online_cpus();
667 set_cpus_allowed_ptr(current, &old);
668 }
669 if (err)
670 return err;
671 return sz;
672}
673
674static ssize_t version_show(struct sys_device *dev, char *buf)
675{
676 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
677
678 return sprintf(buf, "0x%x\n", uci->rev);
679}
680
681static ssize_t pf_show(struct sys_device *dev, char *buf)
682{
683 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
684
685 return sprintf(buf, "0x%x\n", uci->pf);
686}
687
688static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
689static SYSDEV_ATTR(version, 0400, version_show, NULL);
690static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
691
692static struct attribute *mc_default_attrs[] = {
693 &attr_reload.attr,
694 &attr_version.attr,
695 &attr_processor_flags.attr,
696 NULL
697};
698
699static struct attribute_group mc_attr_group = {
700 .attrs = mc_default_attrs,
701 .name = "microcode",
702};
703
704static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
705{
706 int err, cpu = sys_dev->id;
707 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
708
709 if (!cpu_online(cpu))
710 return 0;
711
712 pr_debug("microcode: CPU%d added\n", cpu);
713 memset(uci, 0, sizeof(*uci));
714
715 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
716 if (err)
717 return err;
718
719 microcode_init_cpu(cpu, resume);
720
721 return 0;
722}
723
724static int mc_sysdev_add(struct sys_device *sys_dev)
725{
726 return __mc_sysdev_add(sys_dev, 0);
727}
728
729static int mc_sysdev_remove(struct sys_device *sys_dev)
730{
731 int cpu = sys_dev->id;
732
733 if (!cpu_online(cpu))
734 return 0;
735
736 pr_debug("microcode: CPU%d removed\n", cpu);
737 microcode_fini_cpu(cpu);
738 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
739 return 0;
740}
741
742static int mc_sysdev_resume(struct sys_device *dev)
743{
744 int cpu = dev->id;
745
746 if (!cpu_online(cpu))
747 return 0;
748 pr_debug("microcode: CPU%d resumed\n", cpu);
749 /* only CPU 0 will apply ucode here */
750 apply_microcode(0);
751 return 0;
752}
753
754static struct sysdev_driver mc_sysdev_driver = {
755 .add = mc_sysdev_add,
756 .remove = mc_sysdev_remove,
757 .resume = mc_sysdev_resume,
758};
759
760static __cpuinit int
761mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
762{
763 unsigned int cpu = (unsigned long)hcpu;
764 struct sys_device *sys_dev;
765
766 sys_dev = get_cpu_sysdev(cpu);
767 switch (action) {
768 case CPU_UP_CANCELED_FROZEN:
769 /* The CPU refused to come up during a system resume */
770 microcode_fini_cpu(cpu);
771 break;
772 case CPU_ONLINE:
773 case CPU_DOWN_FAILED:
774 mc_sysdev_add(sys_dev);
775 break;
776 case CPU_ONLINE_FROZEN:
777 /* System-wide resume is in progress, try to apply microcode */
778 if (apply_microcode_check_cpu(cpu)) {
779 /* The application of microcode failed */
780 microcode_fini_cpu(cpu);
781 __mc_sysdev_add(sys_dev, 1);
782 break;
783 }
784 case CPU_DOWN_FAILED_FROZEN:
785 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
786 printk(KERN_ERR "microcode: Failed to create the sysfs "
787 "group for CPU%d\n", cpu);
788 break;
789 case CPU_DOWN_PREPARE:
790 mc_sysdev_remove(sys_dev);
791 break;
792 case CPU_DOWN_PREPARE_FROZEN:
793 /* Suspend is in progress, only remove the interface */
794 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
795 break;
796 }
797 return NOTIFY_OK;
798}
799
800static struct notifier_block __refdata mc_cpu_notifier = {
801 .notifier_call = mc_cpu_callback,
802};
803
804static int __init microcode_init (void)
805{
806 int error;
807
808 error = microcode_dev_init();
809 if (error)
810 return error;
811 microcode_pdev = platform_device_register_simple("microcode", -1,
812 NULL, 0);
813 if (IS_ERR(microcode_pdev)) {
814 microcode_dev_exit();
815 return PTR_ERR(microcode_pdev);
816 }
817
818 get_online_cpus();
819 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
820 put_online_cpus();
821 if (error) {
822 microcode_dev_exit();
823 platform_device_unregister(microcode_pdev);
824 return error;
825 }
826
827 register_hotcpu_notifier(&mc_cpu_notifier);
828
829 printk(KERN_INFO
830 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
831 return 0;
832}
833
834static void __exit microcode_exit (void)
835{
836 microcode_dev_exit();
837
838 unregister_hotcpu_notifier(&mc_cpu_notifier);
839
840 get_online_cpus();
841 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
842 put_online_cpus();
843
844 platform_device_unregister(microcode_pdev);
845}
846
847module_init(microcode_init)
848module_exit(microcode_exit)
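The deleted driver's microcode_sanity_check() above relies on Intel's convention that the 32-bit words of the header plus data area sum to zero. A compact sketch of that check, assuming the caller has already validated the sizes:

#include <stdint.h>
#include <stddef.h>

static int ucode_dword_sum_ok(const void *mc, size_t header_plus_data_bytes)
{
	const uint32_t *p = mc;
	uint32_t sum = 0;
	size_t i;

	/* Sum every 32-bit word of header + data. */
	for (i = 0; i < header_plus_data_bytes / sizeof(uint32_t); i++)
		sum += p[i];

	/* A non-zero sum means the update file is corrupt. */
	return sum == 0;
}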
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
 13 * Licensed under the terms of the GNU General Public

14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
289 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411	printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode "
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows updating the microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
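/*
 * Writing "1" to the per-CPU "reload" attribute re-requests the firmware
 * image and re-applies it; with the sysdev registration below the node is
 * expected to appear as /sys/devices/system/cpu/cpuN/microcode/reload.
 */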
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
328	 * Check if the system resume is in progress (uci->valid is set),
329 * otherwise just request a firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows updating the microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from Prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
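/*
 * Old-format Intel images leave datasize/totalsize zero and are assumed to
 * carry 2000 bytes of data after the 48-byte header (2048 bytes in total).
 */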
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
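/*
 * Two signatures match when they are equal and the platform-flag masks
 * overlap; pf == 0 on both sides (older CPUs without platform flags) also
 * counts as a match.
 */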
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
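		/*
		 * Bits 52:50 of IA32_PLATFORM_ID (bits 20:18 of the high
		 * word) hold the 3-bit platform number; turn it into the
		 * single-bit mask that sigmatch() expects.
		 */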
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
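	/*
	 * The double 16-bit shift extracts the upper half of the address and
	 * stays well-defined (yielding 0) when unsigned long is 32 bits wide.
	 */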
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349		"0x%x to 0x%x, date = %04x-%02x-%02x\n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374			printk(KERN_ERR "microcode: error! "
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
12#include <asm/io.h> 12#include <asm/io.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/acpi.h> 14#include <asm/acpi.h>
15#include <asm/mmconfig.h>
15 16
16#include "../pci/pci.h" 17#include "../pci/pci.h"
17 18
@@ -237,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
237 {} 238 {}
238}; 239};
239 240
240void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
241{ 242{
242 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
243} 244}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e79..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,9 @@
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/acpi.h> 26#include <asm/acpi.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h>
29#include <asm/trampoline.h>
30#include <asm/setup.h>
28 31
29#include <mach_apic.h> 32#include <mach_apic.h>
30#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -32,28 +35,6 @@
32#include <mach_mpparse.h> 35#include <mach_mpparse.h>
33#endif 36#endif
34 37
35/* Have we found an MP table */
36int smp_found_config;
37
38/*
39 * Various Linux-internal data structures created from the
40 * MP-table.
41 */
42#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
43int mp_bus_id_to_type[MAX_MP_BUSSES];
44#endif
45
46DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
47int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
48
49static int mp_current_pci_id;
50
51int pic_mode;
52
53/*
54 * Intel MP BIOS table parsing routines:
55 */
56
57/* 38/*
58 * Checksum an MP configuration block. 39 * Checksum an MP configuration block.
59 */ 40 */
@@ -68,19 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
68 return sum & 0xFF; 49 return sum & 0xFF;
69} 50}
70 51
71#ifdef CONFIG_X86_NUMAQ 52static void __init MP_processor_info(struct mpc_config_processor *m)
72/*
73 * Have to match translation table entries to main table entries by counter
74 * hence the mpc_record variable .... can't see a less disgusting way of
75 * doing this ....
76 */
77
78static int mpc_record;
79static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
80 __cpuinitdata;
81#endif
82
83static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
84{ 53{
85 int apicid; 54 int apicid;
86 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -89,11 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
89 disabled_cpus++; 58 disabled_cpus++;
90 return; 59 return;
91 } 60 }
92#ifdef CONFIG_X86_NUMAQ 61
93 apicid = mpc_apic_id(m, translation_table[mpc_record]); 62 if (x86_quirks->mpc_apic_id)
94#else 63 apicid = x86_quirks->mpc_apic_id(m);
95 apicid = m->mpc_apicid; 64 else
96#endif 65 apicid = m->mpc_apicid;
66
97 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
98 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
99 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -103,18 +73,17 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
103 generic_processor_info(apicid, m->mpc_apicver); 73 generic_processor_info(apicid, m->mpc_apicver);
104} 74}
105 75
76#ifdef CONFIG_X86_IO_APIC
106static void __init MP_bus_info(struct mpc_config_bus *m) 77static void __init MP_bus_info(struct mpc_config_bus *m)
107{ 78{
108 char str[7]; 79 char str[7];
109
110 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
111 str[6] = 0; 81 str[6] = 0;
112 82
113#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
114 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 84 x86_quirks->mpc_oem_bus_info(m, str);
115#else 85 else
116 Dprintk("Bus #%d is %s\n", m->mpc_busid, str); 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
117#endif
118 87
119#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
120 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -131,12 +100,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
131 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
132#endif 101#endif
133 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
134#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
135 mpc_oem_pci_bus(m, translation_table[mpc_record]); 104 x86_quirks->mpc_oem_pci_bus(m);
136#endif 105
137 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
138 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
139 mp_current_pci_id++;
140#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
141 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
142 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +114,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
147 } else 114 } else
148 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
149} 116}
117#endif
150 118
151#ifdef CONFIG_X86_IO_APIC 119#ifdef CONFIG_X86_IO_APIC
152 120
@@ -176,117 +144,111 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
176 if (bad_ioapic(m->mpc_apicaddr)) 144 if (bad_ioapic(m->mpc_apicaddr))
177 return; 145 return;
178 146
179 mp_ioapics[nr_ioapics] = *m; 147 mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
180 nr_ioapics++; 152 nr_ioapics++;
181} 153}
182 154
183static void __init MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
184{ 156{
185 mp_irqs[mp_irq_entries] = *m; 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
186 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
187 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
188 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
189 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
190 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); 161 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
191 if (++mp_irq_entries == MAX_IRQ_SOURCES)
192 panic("Max # of irq sources exceeded!!\n");
193} 162}
194 163
195#endif 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
196
197static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
198{ 165{
199 Dprintk("Lint: type %d, pol %d, trig %d, bus %d," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
200 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
201 m->mpc_irqtype, m->mpc_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
202 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
203 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 170 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
204} 171}
205 172
206#ifdef CONFIG_X86_NUMAQ 173static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
207static void __init MP_translation_info(struct mpc_config_translation *m) 174 struct mp_config_intsrc *mp_irq)
208{ 175{
209 printk(KERN_INFO 176 mp_irq->mp_dstapic = m->mpc_dstapic;
210 "Translation: record %d, type %d, quad %d, global %d, local %d\n", 177 mp_irq->mp_type = m->mpc_type;
211 mpc_record, m->trans_type, m->trans_quad, m->trans_global, 178 mp_irq->mp_irqtype = m->mpc_irqtype;
212 m->trans_local); 179 mp_irq->mp_irqflag = m->mpc_irqflag;
180 mp_irq->mp_srcbus = m->mpc_srcbus;
181 mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
182 mp_irq->mp_dstirq = m->mpc_dstirq;
183}
213 184
214 if (mpc_record >= MAX_MPC_ENTRY) 185static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
215 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); 186 struct mpc_config_intsrc *m)
216 else 187{
217 translation_table[mpc_record] = m; /* stash this for later */ 188 m->mpc_dstapic = mp_irq->mp_dstapic;
218 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) 189 m->mpc_type = mp_irq->mp_type;
219 node_set_online(m->trans_quad); 190 m->mpc_irqtype = mp_irq->mp_irqtype;
191 m->mpc_irqflag = mp_irq->mp_irqflag;
192 m->mpc_srcbus = mp_irq->mp_srcbus;
193 m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
194 m->mpc_dstirq = mp_irq->mp_dstirq;
220} 195}
221 196
222/* 197static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
223 * Read/parse the MPC oem tables 198 struct mpc_config_intsrc *m)
224 */ 199{
200 if (mp_irq->mp_dstapic != m->mpc_dstapic)
201 return 1;
202 if (mp_irq->mp_type != m->mpc_type)
203 return 2;
204 if (mp_irq->mp_irqtype != m->mpc_irqtype)
205 return 3;
206 if (mp_irq->mp_irqflag != m->mpc_irqflag)
207 return 4;
208 if (mp_irq->mp_srcbus != m->mpc_srcbus)
209 return 5;
210 if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
211 return 6;
212 if (mp_irq->mp_dstirq != m->mpc_dstirq)
213 return 7;
214
215 return 0;
216}
225 217
226static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, 218static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
227 unsigned short oemsize)
228{ 219{
229 int count = sizeof(*oemtable); /* the header size */ 220 int i;
230 unsigned char *oemptr = ((unsigned char *)oemtable) + count; 221
231 222 print_MP_intsrc_info(m);
232 mpc_record = 0; 223
233 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", 224 for (i = 0; i < mp_irq_entries; i++) {
234 oemtable); 225 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
235 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { 226 return;
236 printk(KERN_WARNING
237 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
238 oemtable->oem_signature[0], oemtable->oem_signature[1],
239 oemtable->oem_signature[2], oemtable->oem_signature[3]);
240 return;
241 }
242 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
243 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
244 return;
245 }
246 while (count < oemtable->oem_length) {
247 switch (*oemptr) {
248 case MP_TRANSLATION:
249 {
250 struct mpc_config_translation *m =
251 (struct mpc_config_translation *)oemptr;
252 MP_translation_info(m);
253 oemptr += sizeof(*m);
254 count += sizeof(*m);
255 ++mpc_record;
256 break;
257 }
258 default:
259 {
260 printk(KERN_WARNING
261 "Unrecognised OEM table entry type! - %d\n",
262 (int)*oemptr);
263 return;
264 }
265 }
266 } 227 }
228
229 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
230 if (++mp_irq_entries == MAX_IRQ_SOURCES)
231 panic("Max # of irq sources exceeded!!\n");
267} 232}
268 233
269static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, 234#endif
270 char *productid) 235
236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
271{ 237{
272 if (strncmp(oem, "IBM NUMA", 8)) 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
273 printk("Warning! May not be a NUMA-Q system!\n"); 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
274 if (mpc->mpc_oemptr) 240 m->mpc_irqtype, m->mpc_irqflag & 3,
275 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
276 mpc->mpc_oemsize); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
277} 243}
278#endif /* CONFIG_X86_NUMAQ */
279 244
280/* 245/*
281 * Read/parse the MPC 246 * Read/parse the MPC
282 */ 247 */
283 248
284static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) 249static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
250 char *str)
285{ 251{
286 char str[16];
287 char oem[10];
288 int count = sizeof(*mpc);
289 unsigned char *mpt = ((unsigned char *)mpc) + count;
290 252
291 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { 253 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
292 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", 254 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +271,41 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
309 } 271 }
310 memcpy(oem, mpc->mpc_oem, 8); 272 memcpy(oem, mpc->mpc_oem, 8);
311 oem[8] = 0; 273 oem[8] = 0;
312 printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); 274 printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
313 275
314 memcpy(str, mpc->mpc_productid, 12); 276 memcpy(str, mpc->mpc_productid, 12);
315 str[12] = 0; 277 str[12] = 0;
316 printk("Product ID: %s ", str);
317 278
318#ifdef CONFIG_X86_32 279 printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
319 mps_oem_check(mpc, oem, str);
320#endif
321 printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
322 280
323 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); 281 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
324 282
283 return 1;
284}
285
286static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
287{
288 char str[16];
289 char oem[10];
290
291 int count = sizeof(*mpc);
292 unsigned char *mpt = ((unsigned char *)mpc) + count;
293
294 if (!smp_check_mpc(mpc, oem, str))
295 return 0;
296
297#ifdef CONFIG_X86_32
298 /*
299 * need to make sure summit and es7000's mps_oem_check is safe to be
300 * called early via genericarch 's mps_oem_check
301 */
302 if (early) {
303#ifdef CONFIG_X86_NUMAQ
304 numaq_mps_oem_check(mpc, oem, str);
305#endif
306 } else
307 mps_oem_check(mpc, oem, str);
308#endif
325 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
326 if (!acpi_lapic) 310 if (!acpi_lapic)
327 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -329,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
329 if (early) 313 if (early)
330 return 1; 314 return 1;
331 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
332 /* 321 /*
333 * Now process the configuration blocks. 322 * Now process the configuration blocks.
334 */ 323 */
335#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
336 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
337#endif 326
338 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
339 switch (*mpt) { 328 switch (*mpt) {
340 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -352,7 +341,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
352 { 341 {
353 struct mpc_config_bus *m = 342 struct mpc_config_bus *m =
354 (struct mpc_config_bus *)mpt; 343 (struct mpc_config_bus *)mpt;
344#ifdef CONFIG_X86_IO_APIC
355 MP_bus_info(m); 345 MP_bus_info(m);
346#endif
356 mpt += sizeof(*m); 347 mpt += sizeof(*m);
357 count += sizeof(*m); 348 count += sizeof(*m);
358 break; 349 break;
@@ -398,11 +389,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
398 count = mpc->mpc_length; 389 count = mpc->mpc_length;
399 break; 390 break;
400 } 391 }
401#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
402 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
403#endif
404 } 394 }
395
396#ifdef CONFIG_X86_GENERICARCH
397 generic_bigsmp_probe();
398#endif
399
400#ifdef CONFIG_X86_32
405 setup_apic_routing(); 401 setup_apic_routing();
402#endif
406 if (!num_processors) 403 if (!num_processors)
407 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
408 return num_processors; 405 return num_processors;
@@ -427,7 +424,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
427 intsrc.mpc_type = MP_INTSRC; 424 intsrc.mpc_type = MP_INTSRC;
428 intsrc.mpc_irqflag = 0; /* conforming */ 425 intsrc.mpc_irqflag = 0; /* conforming */
429 intsrc.mpc_srcbus = 0; 426 intsrc.mpc_srcbus = 0;
430 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; 427 intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
431 428
432 intsrc.mpc_irqtype = mp_INT; 429 intsrc.mpc_irqtype = mp_INT;
433 430
@@ -488,40 +485,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
488 MP_intsrc_info(&intsrc); 485 MP_intsrc_info(&intsrc);
489} 486}
490 487
491#endif
492 488
493static inline void __init construct_default_ISA_mptable(int mpc_default_type) 489static void __init construct_ioapic_table(int mpc_default_type)
494{ 490{
495 struct mpc_config_processor processor;
496 struct mpc_config_bus bus;
497#ifdef CONFIG_X86_IO_APIC
498 struct mpc_config_ioapic ioapic; 491 struct mpc_config_ioapic ioapic;
499#endif 492 struct mpc_config_bus bus;
500 struct mpc_config_lintsrc lintsrc;
501 int linttypes[2] = { mp_ExtINT, mp_NMI };
502 int i;
503
504 /*
505 * local APIC has default address
506 */
507 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
508
509 /*
510 * 2 CPUs, numbered 0 & 1.
511 */
512 processor.mpc_type = MP_PROCESSOR;
513 /* Either an integrated APIC or a discrete 82489DX. */
514 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
515 processor.mpc_cpuflag = CPU_ENABLED;
516 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
517 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
518 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
519 processor.mpc_reserved[0] = 0;
520 processor.mpc_reserved[1] = 0;
521 for (i = 0; i < 2; i++) {
522 processor.mpc_apicid = i;
523 MP_processor_info(&processor);
524 }
525 493
526 bus.mpc_type = MP_BUS; 494 bus.mpc_type = MP_BUS;
527 bus.mpc_busid = 0; 495 bus.mpc_busid = 0;
@@ -550,7 +518,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
550 MP_bus_info(&bus); 518 MP_bus_info(&bus);
551 } 519 }
552 520
553#ifdef CONFIG_X86_IO_APIC
554 ioapic.mpc_type = MP_IOAPIC; 521 ioapic.mpc_type = MP_IOAPIC;
555 ioapic.mpc_apicid = 2; 522 ioapic.mpc_apicid = 2;
556 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 523 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +529,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
562 * We set up most of the low 16 IO-APIC pins according to MPS rules. 529 * We set up most of the low 16 IO-APIC pins according to MPS rules.
563 */ 530 */
564 construct_default_ioirq_mptable(mpc_default_type); 531 construct_default_ioirq_mptable(mpc_default_type);
532}
533#else
534static inline void __init construct_ioapic_table(int mpc_default_type) { }
565#endif 535#endif
536
537static inline void __init construct_default_ISA_mptable(int mpc_default_type)
538{
539 struct mpc_config_processor processor;
540 struct mpc_config_lintsrc lintsrc;
541 int linttypes[2] = { mp_ExtINT, mp_NMI };
542 int i;
543
544 /*
545 * local APIC has default address
546 */
547 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
548
549 /*
550 * 2 CPUs, numbered 0 & 1.
551 */
552 processor.mpc_type = MP_PROCESSOR;
553 /* Either an integrated APIC or a discrete 82489DX. */
554 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
555 processor.mpc_cpuflag = CPU_ENABLED;
556 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
557 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
558 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
559 processor.mpc_reserved[0] = 0;
560 processor.mpc_reserved[1] = 0;
561 for (i = 0; i < 2; i++) {
562 processor.mpc_apicid = i;
563 MP_processor_info(&processor);
564 }
565
566 construct_ioapic_table(mpc_default_type);
567
566 lintsrc.mpc_type = MP_LINTSRC; 568 lintsrc.mpc_type = MP_LINTSRC;
567 lintsrc.mpc_irqflag = 0; /* conforming */ 569 lintsrc.mpc_irqflag = 0; /* conforming */
568 lintsrc.mpc_srcbusid = 0; 570 lintsrc.mpc_srcbusid = 0;
@@ -580,10 +582,14 @@ static struct intel_mp_floating *mpf_found;
580/* 582/*
581 * Scan the memory blocks for an SMP configuration block. 583 * Scan the memory blocks for an SMP configuration block.
582 */ 584 */
583static void __init __get_smp_config(unsigned early) 585static void __init __get_smp_config(unsigned int early)
584{ 586{
585 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
586 588
589 if (x86_quirks->mach_get_smp_config) {
590 if (x86_quirks->mach_get_smp_config(early))
591 return;
592 }
587 if (acpi_lapic && early) 593 if (acpi_lapic && early)
588 return; 594 return;
589 /* 595 /*
@@ -600,7 +606,7 @@ static void __init __get_smp_config(unsigned early)
600 606
601 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 607 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
602 mpf->mpf_specification); 608 mpf->mpf_specification);
603#ifdef CONFIG_X86_32 609#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
604 if (mpf->mpf_feature2 & (1 << 7)) { 610 if (mpf->mpf_feature2 & (1 << 7)) {
605 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 611 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
606 pic_mode = 1; 612 pic_mode = 1;
@@ -632,7 +638,9 @@ static void __init __get_smp_config(unsigned early)
632 * override the defaults. 638 * override the defaults.
633 */ 639 */
634 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 640 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
641#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 642 smp_found_config = 0;
643#endif
636 printk(KERN_ERR 644 printk(KERN_ERR
637 "BIOS bug, MP table errors detected!...\n"); 645 "BIOS bug, MP table errors detected!...\n");
638 printk(KERN_ERR "... disabling SMP support. " 646 printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
689 unsigned int *bp = phys_to_virt(base); 697 unsigned int *bp = phys_to_virt(base);
690 struct intel_mp_floating *mpf; 698 struct intel_mp_floating *mpf;
691 699
692 Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); 700 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
701 bp, length);
693 BUILD_BUG_ON(sizeof(*mpf) != 16); 702 BUILD_BUG_ON(sizeof(*mpf) != 16);
694 703
695 while (length > 0) { 704 while (length > 0) {
@@ -699,15 +708,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
699 !mpf_checksum((unsigned char *)bp, 16) && 708 !mpf_checksum((unsigned char *)bp, 16) &&
700 ((mpf->mpf_specification == 1) 709 ((mpf->mpf_specification == 1)
701 || (mpf->mpf_specification == 4))) { 710 || (mpf->mpf_specification == 4))) {
702 711#ifdef CONFIG_X86_LOCAL_APIC
703 smp_found_config = 1; 712 smp_found_config = 1;
713#endif
704 mpf_found = mpf; 714 mpf_found = mpf;
705#ifdef CONFIG_X86_32 715
706 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 716 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
707 mpf, virt_to_phys(mpf)); 717 mpf, virt_to_phys(mpf));
708 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, 718
719 if (!reserve)
720 return 1;
721 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
709 BOOTMEM_DEFAULT); 722 BOOTMEM_DEFAULT);
710 if (mpf->mpf_physptr) { 723 if (mpf->mpf_physptr) {
724 unsigned long size = PAGE_SIZE;
725#ifdef CONFIG_X86_32
711 /* 726 /*
712 * We cannot access to MPC table to compute 727 * We cannot access to MPC table to compute
713 * table size yet, as only few megabytes from 728 * table size yet, as only few megabytes from
@@ -717,24 +732,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 * PAGE_SIZE from mpg->mpf_physptr yields BUG() 732 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
718 * in reserve_bootmem. 733 * in reserve_bootmem.
719 */ 734 */
720 unsigned long size = PAGE_SIZE;
721 unsigned long end = max_low_pfn * PAGE_SIZE; 735 unsigned long end = max_low_pfn * PAGE_SIZE;
722 if (mpf->mpf_physptr + size > end) 736 if (mpf->mpf_physptr + size > end)
723 size = end - mpf->mpf_physptr; 737 size = end - mpf->mpf_physptr;
724 reserve_bootmem(mpf->mpf_physptr, size, 738#endif
739 reserve_bootmem_generic(mpf->mpf_physptr, size,
725 BOOTMEM_DEFAULT); 740 BOOTMEM_DEFAULT);
726 } 741 }
727 742
728#else 743 return 1;
729 if (!reserve)
730 return 1;
731
732 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
733 if (mpf->mpf_physptr)
734 reserve_bootmem_generic(mpf->mpf_physptr,
735 PAGE_SIZE);
736#endif
737 return 1;
738 } 744 }
739 bp += 4; 745 bp += 4;
740 length -= 16; 746 length -= 16;
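
What smp_scan_config() is matching above is the 16-byte MP floating pointer structure: a signature (the bytes "_MP_"), a checksum byte that makes the whole structure sum to zero modulo 256, and a specification revision of 1 or 4. A hedged sketch of such a validity test, reusing the 16-byte size asserted by the BUILD_BUG_ON above (mpf_looks_valid() itself is illustrative, not a kernel helper):

static int __init mpf_looks_valid(const unsigned char *p)
{
	unsigned int sum = 0;
	int i;

	if (memcmp(p, "_MP_", 4))	/* signature check */
		return 0;
	for (i = 0; i < 16; i++)	/* sum every byte of the structure */
		sum += p[i];
	return (sum & 0xff) == 0;	/* valid when the sum wraps to zero */
}
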
@@ -742,10 +748,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
742 return 0; 748 return 0;
743} 749}
744 750
745static void __init __find_smp_config(unsigned reserve) 751static void __init __find_smp_config(unsigned int reserve)
746{ 752{
747 unsigned int address; 753 unsigned int address;
748 754
755 if (x86_quirks->mach_find_smp_config) {
756 if (x86_quirks->mach_find_smp_config(reserve))
757 return;
758 }
749 /* 759 /*
750 * FIXME: Linux assumes you have 640K of base ram.. 760 * FIXME: Linux assumes you have 640K of base ram..
751 * this continues the error... 761 * this continues the error...
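
Both __get_smp_config() and __find_smp_config() now give a platform quirk the first shot through the new x86_quirks callbacks and return early when the hook reports that it handled the work. A minimal sketch of that "optional hook, non-zero means handled" pattern (the struct and names below are illustrative only; the real x86_quirks pointer is set up elsewhere and is never NULL):

struct platform_quirks {
	/* return non-zero if the quirk fully handled the request */
	int (*find_smp_config)(unsigned int reserve);
};

static struct platform_quirks *quirks;

static void __init find_config(unsigned int reserve)
{
	if (quirks && quirks->find_smp_config &&
	    quirks->find_smp_config(reserve))
		return;		/* the quirk took care of it */
	/* ... otherwise fall back to the generic BIOS-area scan ... */
}
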
@@ -790,298 +800,294 @@ void __init find_smp_config(void)
790 __find_smp_config(1); 800 __find_smp_config(1);
791} 801}
792 802
793/* -------------------------------------------------------------------------- 803#ifdef CONFIG_X86_IO_APIC
794 ACPI-based MP Configuration 804static u8 __initdata irq_used[MAX_IRQ_SOURCES];
795 -------------------------------------------------------------------------- */
796 805
797/* 806static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
798 * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: 807{
799 */ 808 int i;
800int es7000_plat;
801 809
802#ifdef CONFIG_ACPI 810 if (m->mpc_irqtype != mp_INT)
811 return 0;
803 812
804#ifdef CONFIG_X86_IO_APIC 813 if (m->mpc_irqflag != 0x0f)
814 return 0;
805 815
806#define MP_ISA_BUS 0 816 /* not legacy */
807 817
808extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; 818 for (i = 0; i < mp_irq_entries; i++) {
819 if (mp_irqs[i].mp_irqtype != mp_INT)
820 continue;
809 821
810static int mp_find_ioapic(int gsi) 822 if (mp_irqs[i].mp_irqflag != 0x0f)
811{ 823 continue;
812 int i = 0;
813 824
814 /* Find the IOAPIC that manages this GSI. */ 825 if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
815 for (i = 0; i < nr_ioapics; i++) { 826 continue;
816 if ((gsi >= mp_ioapic_routing[i].gsi_base) 827 if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
817 && (gsi <= mp_ioapic_routing[i].gsi_end)) 828 continue;
818 return i; 829 if (irq_used[i]) {
830 /* already claimed */
831 return -2;
832 }
833 irq_used[i] = 1;
834 return i;
819 } 835 }
820 836
821 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); 837 /* not found */
822 return -1; 838 return -1;
823} 839}
824 840
825static u8 __init uniq_ioapic_id(u8 id) 841#define SPARE_SLOT_NUM 20
826{ 842
827#ifdef CONFIG_X86_32 843static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
828 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
829 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
830 return io_apic_get_unique_id(nr_ioapics, id);
831 else
832 return id;
833#else
834 int i;
835 DECLARE_BITMAP(used, 256);
836 bitmap_zero(used, 256);
837 for (i = 0; i < nr_ioapics; i++) {
838 struct mpc_config_ioapic *ia = &mp_ioapics[i];
839 __set_bit(ia->mpc_apicid, used);
840 }
841 if (!test_bit(id, used))
842 return id;
843 return find_first_zero_bit(used, 256);
844#endif 844#endif
845}
846 845
847void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 846static int __init replace_intsrc_all(struct mp_config_table *mpc,
847 unsigned long mpc_new_phys,
848 unsigned long mpc_new_length)
848{ 849{
849 int idx = 0; 850#ifdef CONFIG_X86_IO_APIC
850 851 int i;
851 if (bad_ioapic(address)) 852 int nr_m_spare = 0;
852 return; 853#endif
853 854
854 idx = nr_ioapics; 855 int count = sizeof(*mpc);
856 unsigned char *mpt = ((unsigned char *)mpc) + count;
855 857
856 mp_ioapics[idx].mpc_type = MP_IOAPIC; 858 printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
857 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; 859 while (count < mpc->mpc_length) {
858 mp_ioapics[idx].mpc_apicaddr = address; 860 switch (*mpt) {
861 case MP_PROCESSOR:
862 {
863 struct mpc_config_processor *m =
864 (struct mpc_config_processor *)mpt;
865 mpt += sizeof(*m);
866 count += sizeof(*m);
867 break;
868 }
869 case MP_BUS:
870 {
871 struct mpc_config_bus *m =
872 (struct mpc_config_bus *)mpt;
873 mpt += sizeof(*m);
874 count += sizeof(*m);
875 break;
876 }
877 case MP_IOAPIC:
878 {
879 mpt += sizeof(struct mpc_config_ioapic);
880 count += sizeof(struct mpc_config_ioapic);
881 break;
882 }
883 case MP_INTSRC:
884 {
885#ifdef CONFIG_X86_IO_APIC
886 struct mpc_config_intsrc *m =
887 (struct mpc_config_intsrc *)mpt;
859 888
860 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 889 printk(KERN_INFO "OLD ");
861 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); 890 print_MP_intsrc_info(m);
862#ifdef CONFIG_X86_32 891 i = get_MP_intsrc_index(m);
863 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 892 if (i > 0) {
864#else 893 assign_to_mpc_intsrc(&mp_irqs[i], m);
865 mp_ioapics[idx].mpc_apicver = 0; 894 printk(KERN_INFO "NEW ");
895 print_mp_irq_info(&mp_irqs[i]);
896 } else if (!i) {
897 /* legacy, do nothing */
898 } else if (nr_m_spare < SPARE_SLOT_NUM) {
899 /*
900 * not found (-1), or duplicated (-2)
901 * are invalid entries,
902 * we need to use the slot later
903 */
904 m_spare[nr_m_spare] = m;
905 nr_m_spare++;
906 }
866#endif 907#endif
867 /* 908 mpt += sizeof(struct mpc_config_intsrc);
868 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 909 count += sizeof(struct mpc_config_intsrc);
869 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 910 break;
870 */ 911 }
871 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 912 case MP_LINTSRC:
872 mp_ioapic_routing[idx].gsi_base = gsi_base; 913 {
873 mp_ioapic_routing[idx].gsi_end = gsi_base + 914 struct mpc_config_lintsrc *m =
874 io_apic_get_redir_entries(idx); 915 (struct mpc_config_lintsrc *)mpt;
875 916 mpt += sizeof(*m);
876 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 917 count += sizeof(*m);
877 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 918 break;
878 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 919 }
879 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 920 default:
880 921 /* wrong mptable */
881 nr_ioapics++; 922 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
882} 923 printk(KERN_ERR "type %x\n", *mpt);
924 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
925 1, mpc, mpc->mpc_length, 1);
926 goto out;
927 }
928 }
883 929
884void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 930#ifdef CONFIG_X86_IO_APIC
885{ 931 for (i = 0; i < mp_irq_entries; i++) {
886 struct mpc_config_intsrc intsrc; 932 if (irq_used[i])
887 int ioapic = -1; 933 continue;
888 int pin = -1;
889 934
890 /* 935 if (mp_irqs[i].mp_irqtype != mp_INT)
891 * Convert 'gsi' to 'ioapic.pin'. 936 continue;
892 */
893 ioapic = mp_find_ioapic(gsi);
894 if (ioapic < 0)
895 return;
896 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
897 937
898 /* 938 if (mp_irqs[i].mp_irqflag != 0x0f)
899 * TBD: This check is for faulty timer entries, where the override 939 continue;
900 * erroneously sets the trigger to level, resulting in a HUGE
901 * increase of timer interrupts!
902 */
903 if ((bus_irq == 0) && (trigger == 3))
904 trigger = 1;
905 940
906 intsrc.mpc_type = MP_INTSRC; 941 if (nr_m_spare > 0) {
907 intsrc.mpc_irqtype = mp_INT; 942 printk(KERN_INFO "*NEW* found ");
908 intsrc.mpc_irqflag = (trigger << 2) | polarity; 943 nr_m_spare--;
909 intsrc.mpc_srcbus = MP_ISA_BUS; 944 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
910 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ 945 m_spare[nr_m_spare] = NULL;
911 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ 946 } else {
912 intsrc.mpc_dstirq = pin; /* INTIN# */ 947 struct mpc_config_intsrc *m =
948 (struct mpc_config_intsrc *)mpt;
949 count += sizeof(struct mpc_config_intsrc);
950 if (!mpc_new_phys) {
951 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
952 } else {
953 if (count <= mpc_new_length)
954 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
955 else {
956 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
957 goto out;
958 }
959 }
960 assign_to_mpc_intsrc(&mp_irqs[i], m);
961 mpc->mpc_length = count;
962 mpt += sizeof(struct mpc_config_intsrc);
963 }
964 print_mp_irq_info(&mp_irqs[i]);
965 }
966#endif
967out:
968 /* update checksum */
969 mpc->mpc_checksum = 0;
970 mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
971 mpc->mpc_length);
913 972
914 MP_intsrc_info(&intsrc); 973 return 0;
915} 974}
916 975
917void __init mp_config_acpi_legacy_irqs(void) 976static int __initdata enable_update_mptable;
918{
919 struct mpc_config_intsrc intsrc;
920 int i = 0;
921 int ioapic = -1;
922 977
923#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 978static int __init update_mptable_setup(char *str)
924 /* 979{
925 * Fabricate the legacy ISA bus (bus #31). 980 enable_update_mptable = 1;
926 */ 981 return 0;
927 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 982}
928#endif 983early_param("update_mptable", update_mptable_setup);
929 set_bit(MP_ISA_BUS, mp_bus_not_pci);
930 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
931 984
932 /* 985static unsigned long __initdata mpc_new_phys;
933 * Older generations of ES7000 have no legacy identity mappings 986static unsigned long mpc_new_length __initdata = 4096;
934 */
935 if (es7000_plat == 1)
936 return;
937 987
938 /* 988/* alloc_mptable or alloc_mptable=4k */
939 * Locate the IOAPIC that manages the ISA IRQs (0-15). 989static int __initdata alloc_mptable;
940 */ 990static int __init parse_alloc_mptable_opt(char *p)
941 ioapic = mp_find_ioapic(0); 991{
942 if (ioapic < 0) 992 enable_update_mptable = 1;
943 return; 993 alloc_mptable = 1;
994 if (!p)
995 return 0;
996 mpc_new_length = memparse(p, &p);
997 return 0;
998}
999early_param("alloc_mptable", parse_alloc_mptable_opt);
944 1000
945 intsrc.mpc_type = MP_INTSRC; 1001void __init early_reserve_e820_mpc_new(void)
946 intsrc.mpc_irqflag = 0; /* Conforming */ 1002{
947 intsrc.mpc_srcbus = MP_ISA_BUS; 1003 if (enable_update_mptable && alloc_mptable) {
948#ifdef CONFIG_X86_IO_APIC 1004 u64 startt = 0;
949 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; 1005#ifdef CONFIG_X86_TRAMPOLINE
1006 startt = TRAMPOLINE_BASE;
950#endif 1007#endif
951 /* 1008 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
952 * Use the default configuration for the IRQs 0-15. Unless
953 * overridden by (MADT) interrupt source override entries.
954 */
955 for (i = 0; i < 16; i++) {
956 int idx;
957
958 for (idx = 0; idx < mp_irq_entries; idx++) {
959 struct mpc_config_intsrc *irq = mp_irqs + idx;
960
961 /* Do we already have a mapping for this ISA IRQ? */
962 if (irq->mpc_srcbus == MP_ISA_BUS
963 && irq->mpc_srcbusirq == i)
964 break;
965
966 /* Do we already have a mapping for this IOAPIC pin */
967 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
968 (irq->mpc_dstirq == i))
969 break;
970 }
971
972 if (idx != mp_irq_entries) {
973 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
974 continue; /* IRQ already used */
975 }
976
977 intsrc.mpc_irqtype = mp_INT;
978 intsrc.mpc_srcbusirq = i; /* Identity mapped */
979 intsrc.mpc_dstirq = i;
980
981 MP_intsrc_info(&intsrc);
982 } 1009 }
983} 1010}
984 1011
985int mp_register_gsi(u32 gsi, int triggering, int polarity) 1012static int __init update_mp_table(void)
986{ 1013{
987 int ioapic; 1014 char str[16];
988 int ioapic_pin; 1015 char oem[10];
989#ifdef CONFIG_X86_32 1016 struct intel_mp_floating *mpf;
990#define MAX_GSI_NUM 4096 1017 struct mp_config_table *mpc;
991#define IRQ_COMPRESSION_START 64 1018 struct mp_config_table *mpc_new;
1019
1020 if (!enable_update_mptable)
1021 return 0;
1022
1023 mpf = mpf_found;
1024 if (!mpf)
1025 return 0;
992 1026
993 static int pci_irq = IRQ_COMPRESSION_START;
994 /* 1027 /*
995 * Mapping between Global System Interrupts, which 1028 * Now see if we need to go further.
996 * represent all possible interrupts, and IRQs
997 * assigned to actual devices.
998 */ 1029 */
999 static int gsi_to_irq[MAX_GSI_NUM]; 1030 if (mpf->mpf_feature1 != 0)
1000#else 1031 return 0;
1001 1032
1002 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1033 if (!mpf->mpf_physptr)
1003 return gsi; 1034 return 0;
1004#endif
1005 1035
1006 /* Don't set up the ACPI SCI because it's already set up */ 1036 mpc = phys_to_virt(mpf->mpf_physptr);
1007 if (acpi_gbl_FADT.sci_interrupt == gsi)
1008 return gsi;
1009 1037
1010 ioapic = mp_find_ioapic(gsi); 1038 if (!smp_check_mpc(mpc, oem, str))
1011 if (ioapic < 0) { 1039 return 0;
1012 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1013 return gsi;
1014 }
1015 1040
1016 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1041 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
1042 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
1017 1043
1018#ifdef CONFIG_X86_32 1044 if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
1019 if (ioapic_renumber_irq) 1045 mpc_new_phys = 0;
1020 gsi = ioapic_renumber_irq(ioapic, gsi); 1046 printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
1021#endif 1047 mpc_new_length);
1022
1023 /*
1024 * Avoid pin reprogramming. PRTs typically include entries
1025 * with redundant pin->gsi mappings (but unique PCI devices);
1026 * we only program the IOAPIC on the first.
1027 */
1028 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1029 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1030 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1031 ioapic_pin);
1032 return gsi;
1033 } 1048 }
1034 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1049
1035 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1050 if (!mpc_new_phys) {
1036 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1051 unsigned char old, new;
1037#ifdef CONFIG_X86_32 1052 /* check if we can change the position */
1038 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1053 mpc->mpc_checksum = 0;
1039#else 1054 old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1040 return gsi; 1055 mpc->mpc_checksum = 0xff;
1041#endif 1056 new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1057 if (old == new) {
1058 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
1059 return 0;
1060 }
1061 printk(KERN_INFO "use in-positon replacing\n");
1062 } else {
1063 mpf->mpf_physptr = mpc_new_phys;
1064 mpc_new = phys_to_virt(mpc_new_phys);
1065 memcpy(mpc_new, mpc, mpc->mpc_length);
1066 mpc = mpc_new;
1067 /* check if we can modify that */
1068 if (mpc_new_phys - mpf->mpf_physptr) {
1069 struct intel_mp_floating *mpf_new;
1070 /* steal 16 bytes from [0, 1k) */
1071 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1072 mpf_new = phys_to_virt(0x400 - 16);
1073 memcpy(mpf_new, mpf, 16);
1074 mpf = mpf_new;
1075 mpf->mpf_physptr = mpc_new_phys;
1076 }
1077 mpf->mpf_checksum = 0;
1078 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
1079 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
1042 } 1080 }
1043 1081
1044 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1045#ifdef CONFIG_X86_32
1046 /* 1082 /*
1047 * For GSI >= 64, use IRQ compression 1083 * only replace the one with mp_INT and
1084 * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
1085 * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
1086 * may need pci=routeirq for all coverage
1048 */ 1087 */
1049 if ((gsi >= IRQ_COMPRESSION_START) 1088 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
1050 && (triggering == ACPI_LEVEL_SENSITIVE)) { 1089
1051 /* 1090 return 0;
1052 * For PCI devices assign IRQs in order, avoiding gaps
1053 * due to unused I/O APIC pins.
1054 */
1055 int irq = gsi;
1056 if (gsi < MAX_GSI_NUM) {
1057 /*
1058 * Retain the VIA chipset work-around (gsi > 15), but
1059 * avoid a problem where the 8254 timer (IRQ0) is setup
1060 * via an override (so it's not on pin 0 of the ioapic),
1061 * and at the same time, the pin 0 interrupt is a PCI
1062 * type. The gsi > 15 test could cause these two pins
1063 * to be shared as IRQ0, and they are not shareable.
1064 * So test for this condition, and if necessary, avoid
1065 * the pin collision.
1066 */
1067 gsi = pci_irq++;
1068 /*
1069 * Don't assign IRQ used by ACPI SCI
1070 */
1071 if (gsi == acpi_gbl_FADT.sci_interrupt)
1072 gsi = pci_irq++;
1073 gsi_to_irq[irq] = gsi;
1074 } else {
1075 printk(KERN_ERR "GSI %u is too high\n", gsi);
1076 return gsi;
1077 }
1078 }
1079#endif
1080 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1081 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1082 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1083 return gsi;
1084} 1091}
1085 1092
1086#endif /* CONFIG_X86_IO_APIC */ 1093late_initcall(update_mp_table);
1087#endif /* CONFIG_ACPI */
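
One detail worth calling out from update_mp_table() above: it decides whether the firmware MP table can be patched in place by storing two different values into the checksum byte and re-summing the table. If both sums come out identical, the stores never landed and the table is treated as read-only (the user is told to use alloc_mptable instead). A condensed sketch of that probe, reusing the mpf_checksum() helper (the wrapper function itself is hypothetical):

/* Returns non-zero if the MP config table appears to accept writes. */
static int __init mpc_is_writable(struct mp_config_table *mpc)
{
	unsigned char a, b;

	mpc->mpc_checksum = 0;
	a = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
	mpc->mpc_checksum = 0xff;
	b = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);

	/* different sums mean the two stores were observed: writable */
	return a != b;
}
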
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1f3abe048e93..2e2af5d18191 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
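
The msr_read() hunk above moves the handler onto the usual read(2) contract: keep a running count of bytes already copied and, on a later failure, return that count rather than the error, falling back to the error code only when nothing was transferred. A generic sketch of the idiom (the 8-byte chunk size matches the MSR case; fetch_one_chunk() is a hypothetical helper standing in for rdmsr_safe_on_cpu() plus copy_to_user()):

#define CHUNK 8

static ssize_t chunked_read(char __user *buf, size_t count)
{
	ssize_t bytes = 0;
	int err = 0;

	while (count >= CHUNK) {
		err = fetch_one_chunk(buf + bytes);	/* hypothetical helper */
		if (err)
			break;		/* keep whatever was already copied */
		bytes += CHUNK;
		count -= CHUNK;
	}
	return bytes ? bytes : err;	/* partial success wins over the error */
}
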
@@ -96,34 +103,49 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
117{ 131{
118 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 132 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
119 struct cpuinfo_x86 *c = &cpu_data(cpu); 133 struct cpuinfo_x86 *c = &cpu_data(cpu);
134 int ret = 0;
120 135
121 if (cpu >= NR_CPUS || !cpu_online(cpu)) 136 lock_kernel();
122 return -ENXIO; /* No such CPU */ 137 cpu = iminor(file->f_path.dentry->d_inode);
123 if (!cpu_has(c, X86_FEATURE_MSR))
124 return -EIO; /* MSR not supported */
125 138
126 return 0; 139 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
140 ret = -ENXIO; /* No such CPU */
141 goto out;
142 }
143 c = &cpu_data(cpu);
144 if (!cpu_has(c, X86_FEATURE_MSR))
145 ret = -EIO; /* MSR not supported */
146out:
147 unlock_kernel();
148 return ret;
127} 149}
128 150
129/* 151/*
@@ -141,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
141{ 163{
142 struct device *dev; 164 struct device *dev;
143 165
144 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
145 "msr%d", cpu); 167 NULL, "msr%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 169}
148 170
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi.c
index 5a29ded994fa..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi.c
@@ -6,10 +6,13 @@
6 * Fixes: 6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. 7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog. 8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
9 * Pavel Machek and 10 * Pavel Machek and
10 * Mikael Pettersson : PM converted to driver model. Disable/enable API. 11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
11 */ 12 */
12 13
14#include <asm/apic.h>
15
13#include <linux/nmi.h> 16#include <linux/nmi.h>
14#include <linux/mm.h> 17#include <linux/mm.h>
15#include <linux/delay.h> 18#include <linux/delay.h>
@@ -17,20 +20,26 @@
17#include <linux/module.h> 20#include <linux/module.h>
18#include <linux/sysdev.h> 21#include <linux/sysdev.h>
19#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/percpu.h>
20#include <linux/kprobes.h> 24#include <linux/kprobes.h>
21#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/kernel_stat.h>
22#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/smp.h>
23 29
30#include <asm/i8259.h>
31#include <asm/io_apic.h>
24#include <asm/smp.h> 32#include <asm/smp.h>
25#include <asm/nmi.h> 33#include <asm/nmi.h>
26#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/timer.h>
36
27#include <asm/mce.h> 37#include <asm/mce.h>
28 38
29#include <mach_traps.h> 39#include <mach_traps.h>
30 40
31int unknown_nmi_panic; 41int unknown_nmi_panic;
32int nmi_watchdog_enabled; 42int nmi_watchdog_enabled;
33int panic_on_unrecovered_nmi;
34 43
35static cpumask_t backtrace_mask = CPU_MASK_NONE; 44static cpumask_t backtrace_mask = CPU_MASK_NONE;
36 45
@@ -41,104 +50,148 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
41 * 0: the lapic NMI watchdog is disabled, but can be enabled 50 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */ 51 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
44static int panic_on_timeout; 58static int panic_on_timeout;
45 59
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ; 60static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled); 61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
50 63
51/* Run after command line and cpu_init init, but before all other checks */ 64static inline unsigned int get_nmi_count(int cpu)
52void nmi_watchdog_default(void)
53{ 65{
54 if (nmi_watchdog != NMI_DEFAULT) 66#ifdef CONFIG_X86_64
55 return; 67 return cpu_pda(cpu)->__nmi_count;
56 nmi_watchdog = NMI_NONE; 68#else
69 return nmi_count(cpu);
70#endif
57} 71}
58 72
59static int endflag __initdata = 0; 73static inline int mce_in_progress(void)
74{
75#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
76 return atomic_read(&mce_entry) > 0;
77#endif
78 return 0;
79}
80
81/*
82 * Take the local apic timer and PIT/HPET into account. We don't
83 * know which one is active, when we have highres/dyntick on
84 */
85static inline unsigned int get_timer_irqs(int cpu)
86{
87#ifdef CONFIG_X86_64
88 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
89#else
90 return per_cpu(irq_stat, cpu).apic_timer_irqs +
91 per_cpu(irq_stat, cpu).irq0_irqs;
92#endif
93}
60 94
61#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
62/* The performance counters used by NMI_LOCAL_APIC don't trigger when 96/*
97 * The performance counters used by NMI_LOCAL_APIC don't trigger when
63 * the CPU is idle. To make sure the NMI watchdog really ticks on all 98 * the CPU is idle. To make sure the NMI watchdog really ticks on all
64 * CPUs during the test make them busy. 99 * CPUs during the test make them busy.
65 */ 100 */
66static __init void nmi_cpu_busy(void *data) 101static __init void nmi_cpu_busy(void *data)
67{ 102{
68 local_irq_enable_in_hardirq(); 103 local_irq_enable_in_hardirq();
69 /* Intentionally don't use cpu_relax here. This is 104 /*
70 to make sure that the performance counter really ticks, 105 * Intentionally don't use cpu_relax here. This is
71 even if there is a simulator or similar that catches the 106 * to make sure that the performance counter really ticks,
72 pause instruction. On a real HT machine this is fine because 107 * even if there is a simulator or similar that catches the
73 all other CPUs are busy with "useless" delay loops and don't 108 * pause instruction. On a real HT machine this is fine because
74 care if they get somewhat less cycles. */ 109 * all other CPUs are busy with "useless" delay loops and don't
110 * care if they get somewhat less cycles.
111 */
75 while (endflag == 0) 112 while (endflag == 0)
76 mb(); 113 mb();
77} 114}
78#endif 115#endif
79 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
80int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
81{ 135{
82 int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
83 int cpu; 137 int cpu;
84 138
85 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 139 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
86 return 0;
87
88 if (!atomic_read(&nmi_active))
89 return 0; 140 return 0;
90 141
91 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 142 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
92 if (!prev_nmi_count) 143 if (!prev_nmi_count)
93 return -1; 144 goto error;
94 145
95 printk(KERN_INFO "Testing NMI watchdog ... "); 146 printk(KERN_INFO "Testing NMI watchdog ... ");
96 147
97#ifdef CONFIG_SMP 148#ifdef CONFIG_SMP
98 if (nmi_watchdog == NMI_LOCAL_APIC) 149 if (nmi_watchdog == NMI_LOCAL_APIC)
99 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 150 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
100#endif 151#endif
101 152
102 for (cpu = 0; cpu < NR_CPUS; cpu++) 153 for_each_possible_cpu(cpu)
103 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; 154 prev_nmi_count[cpu] = get_nmi_count(cpu);
104 local_irq_enable(); 155 local_irq_enable();
105 mdelay((20*1000)/nmi_hz); // wait 20 ticks 156 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
106 157
107 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
108 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
109 continue; 160 continue;
110 if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
111 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
112 "appears to be stuck (%d->%d)!\n",
113 cpu,
114 prev_nmi_count[cpu],
115 cpu_pda(cpu)->__nmi_count);
116 per_cpu(wd_enabled, cpu) = 0;
117 atomic_dec(&nmi_active);
118 }
119 } 163 }
120 endflag = 1; 164 endflag = 1;
121 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
122 kfree(prev_nmi_count); 166 kfree(prev_nmi_count);
123 atomic_set(&nmi_active, -1); 167 atomic_set(&nmi_active, -1);
124 return -1; 168 goto error;
125 } 169 }
126 printk("OK.\n"); 170 printk("OK.\n");
127 171
128 /* now that we know it works we can reduce NMI frequency to 172 /*
129 something more reasonable; makes a difference in some configs */ 173 * now that we know it works we can reduce NMI frequency to
174 * something more reasonable; makes a difference in some configs
175 */
130 if (nmi_watchdog == NMI_LOCAL_APIC) 176 if (nmi_watchdog == NMI_LOCAL_APIC)
131 nmi_hz = lapic_adjust_nmi_hz(1); 177 nmi_hz = lapic_adjust_nmi_hz(1);
132 178
133 kfree(prev_nmi_count); 179 kfree(prev_nmi_count);
134 return 0; 180 return 0;
181error:
182 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
183 disable_8259A_irq(0);
184#ifdef CONFIG_X86_32
185 timer_ack = 0;
186#endif
187 return -1;
135} 188}
136 189
137static int __init setup_nmi_watchdog(char *str) 190static int __init setup_nmi_watchdog(char *str)
138{ 191{
139 int nmi; 192 unsigned int nmi;
140 193
141 if (!strncmp(str,"panic",5)) { 194 if (!strncmp(str, "panic", 5)) {
142 panic_on_timeout = 1; 195 panic_on_timeout = 1;
143 str = strchr(str, ','); 196 str = strchr(str, ',');
144 if (!str) 197 if (!str)
@@ -148,15 +201,17 @@ static int __init setup_nmi_watchdog(char *str)
148 201
149 get_option(&str, &nmi); 202 get_option(&str, &nmi);
150 203
151 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) 204 if (nmi >= NMI_INVALID)
152 return 0; 205 return 0;
153 206
154 nmi_watchdog = nmi; 207 nmi_watchdog = nmi;
155 return 1; 208 return 1;
156} 209}
157
158__setup("nmi_watchdog=", setup_nmi_watchdog); 210__setup("nmi_watchdog=", setup_nmi_watchdog);
159 211
212/*
213 * Suspend/resume support
214 */
160#ifdef CONFIG_PM 215#ifdef CONFIG_PM
161 216
162static int nmi_pm_active; /* nmi_active before suspend */ 217static int nmi_pm_active; /* nmi_active before suspend */
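
The reworked check_nmi_watchdog() further up boils down to one test that is now identical on 32- and 64-bit: snapshot every CPU's NMI count behind the new get_nmi_count() helper, keep the CPUs busy so the performance counters actually tick, wait roughly 20 watchdog periods, then flag any CPU whose count barely advanced. A condensed sketch using the helpers introduced by this patch (prev[] stands in for the kmalloc'ed prev_nmi_count array):

for_each_possible_cpu(cpu)
	prev[cpu] = get_nmi_count(cpu);		/* snapshot before the wait */

mdelay((20 * 1000) / nmi_hz);			/* roughly 20 watchdog ticks */

for_each_online_cpu(cpu) {
	if (!per_cpu(wd_enabled, cpu))
		continue;
	if (get_nmi_count(cpu) - prev[cpu] <= 5)	/* barely moved */
		report_broken_nmi(cpu, prev);		/* disable it on that CPU */
}
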
@@ -195,7 +250,8 @@ static int __init init_lapic_nmi_sysfs(void)
195{ 250{
196 int error; 251 int error;
197 252
198 /* should really be a BUG_ON but b/c this is an 253 /*
254 * should really be a BUG_ON but b/c this is an
199 * init call, it just doesn't work. -dcz 255 * init call, it just doesn't work. -dcz
200 */ 256 */
201 if (nmi_watchdog != NMI_LOCAL_APIC) 257 if (nmi_watchdog != NMI_LOCAL_APIC)
@@ -209,6 +265,7 @@ static int __init init_lapic_nmi_sysfs(void)
209 error = sysdev_register(&device_lapic_nmi); 265 error = sysdev_register(&device_lapic_nmi);
210 return error; 266 return error;
211} 267}
268
212/* must come after the local APIC's device_initcall() */ 269/* must come after the local APIC's device_initcall() */
213late_initcall(init_lapic_nmi_sysfs); 270late_initcall(init_lapic_nmi_sysfs);
214 271
@@ -225,7 +282,7 @@ static void __acpi_nmi_enable(void *__unused)
225void acpi_nmi_enable(void) 282void acpi_nmi_enable(void)
226{ 283{
227 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 284 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
228 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); 285 on_each_cpu(__acpi_nmi_enable, NULL, 1);
229} 286}
230 287
231static void __acpi_nmi_disable(void *__unused) 288static void __acpi_nmi_disable(void *__unused)
@@ -239,7 +296,16 @@ static void __acpi_nmi_disable(void *__unused)
239void acpi_nmi_disable(void) 296void acpi_nmi_disable(void)
240{ 297{
241 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 298 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
242 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
300}
301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
243} 309}
244 310
245void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
@@ -249,12 +315,11 @@ void setup_apic_nmi_watchdog(void *unused)
249 315
250 /* cheap hack to support suspend/resume */ 316 /* cheap hack to support suspend/resume */
251 /* if cpu0 is not active neither should the other cpus */ 317 /* if cpu0 is not active neither should the other cpus */
252 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) 318 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
253 return; 319 return;
254 320
255 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
256 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
257 __get_cpu_var(wd_enabled) = 1;
258 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
259 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
260 return; 325 return;
@@ -269,9 +334,8 @@ void setup_apic_nmi_watchdog(void *unused)
269void stop_apic_nmi_watchdog(void *unused) 334void stop_apic_nmi_watchdog(void *unused)
270{ 335{
271 /* only support LOCAL and IO APICs for now */ 336 /* only support LOCAL and IO APICs for now */
272 if ((nmi_watchdog != NMI_LOCAL_APIC) && 337 if (!nmi_watchdog_active())
273 (nmi_watchdog != NMI_IO_APIC)) 338 return;
274 return;
275 if (__get_cpu_var(wd_enabled) == 0) 339 if (__get_cpu_var(wd_enabled) == 0)
276 return; 340 return;
277 if (nmi_watchdog == NMI_LOCAL_APIC) 341 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -287,6 +351,11 @@ void stop_apic_nmi_watchdog(void *unused)
287 * 351 *
288 * as these watchdog NMI IRQs are generated on every CPU, we only 352 * as these watchdog NMI IRQs are generated on every CPU, we only
289 * have to check the current processor. 353 * have to check the current processor.
354 *
355 * since NMIs don't listen to _any_ locks, we have to be extremely
356 * careful not to rely on unsafe variables. The printk might lock
357 * up though, so we have to break up any console locks first ...
358 * [when there will be more tty-related locks, break them up here too!]
290 */ 359 */
291 360
292static DEFINE_PER_CPU(unsigned, last_irq_sum); 361static DEFINE_PER_CPU(unsigned, last_irq_sum);
@@ -295,11 +364,11 @@ static DEFINE_PER_CPU(int, nmi_touch);
295 364
296void touch_nmi_watchdog(void) 365void touch_nmi_watchdog(void)
297{ 366{
298 if (nmi_watchdog > 0) { 367 if (nmi_watchdog_active()) {
299 unsigned cpu; 368 unsigned cpu;
300 369
301 /* 370 /*
302 * Tell other CPUs to reset their alert counters. We cannot 371 * Tell other CPUs to reset their alert counters. We cannot
303 * do it ourselves because the alert count increase is not 372 * do it ourselves because the alert count increase is not
304 * atomic. 373 * atomic.
305 */ 374 */
@@ -309,6 +378,9 @@ void touch_nmi_watchdog(void)
309 } 378 }
310 } 379 }
311 380
381 /*
382 * Tickle the softlockup detector too:
383 */
312 touch_softlockup_watchdog(); 384 touch_softlockup_watchdog();
313} 385}
314EXPORT_SYMBOL(touch_nmi_watchdog); 386EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -316,7 +388,12 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
316notrace __kprobes int 388notrace __kprobes int
317nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) 389nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
318{ 390{
319 int sum; 391 /*
392 * Since current_thread_info()-> is always on the stack, and we
393 * always switch the stack NMI-atomically, it's safe to use
394 * smp_processor_id().
395 */
396 unsigned int sum;
320 int touched = 0; 397 int touched = 0;
321 int cpu = smp_processor_id(); 398 int cpu = smp_processor_id();
322 int rc = 0; 399 int rc = 0;
@@ -328,7 +405,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
328 touched = 1; 405 touched = 1;
329 } 406 }
330 407
331 sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); 408 sum = get_timer_irqs(cpu);
409
332 if (__get_cpu_var(nmi_touch)) { 410 if (__get_cpu_var(nmi_touch)) {
333 __get_cpu_var(nmi_touch) = 0; 411 __get_cpu_var(nmi_touch) = 0;
334 touched = 1; 412 touched = 1;
@@ -338,28 +416,29 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
338 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 416 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
339 417
340 spin_lock(&lock); 418 spin_lock(&lock);
341 printk("NMI backtrace for cpu %d\n", cpu); 419 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
342 dump_stack(); 420 dump_stack();
343 spin_unlock(&lock); 421 spin_unlock(&lock);
344 cpu_clear(cpu, backtrace_mask); 422 cpu_clear(cpu, backtrace_mask);
345 } 423 }
346 424
347#ifdef CONFIG_X86_MCE 425 /* Could check oops_in_progress here too, but it's safer not to */
348 /* Could check oops_in_progress here too, but it's safer 426 if (mce_in_progress())
349 not too */
350 if (atomic_read(&mce_entry) > 0)
351 touched = 1; 427 touched = 1;
352#endif 428
353 /* if the apic timer isn't firing, this cpu isn't doing much */ 429 /* if none of the timers is firing, this cpu isn't doing much */
354 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
355 /* 431 /*
356 * Ayiee, looks like this CPU is stuck ... 432 * Ayiee, looks like this CPU is stuck ...
357 * wait a few IRQs (5 seconds) before doing the oops ... 433 * wait a few IRQs (5 seconds) before doing the oops ...
358 */ 434 */
359 local_inc(&__get_cpu_var(alert_counter)); 435 local_inc(&__get_cpu_var(alert_counter));
360 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) 436 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
361 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, 437 /*
362 panic_on_timeout); 438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
363 } else { 442 } else {
364 __get_cpu_var(last_irq_sum) = sum; 443 __get_cpu_var(last_irq_sum) = sum;
365 local_set(&__get_cpu_var(alert_counter), 0); 444 local_set(&__get_cpu_var(alert_counter), 0);
@@ -373,7 +452,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
373 rc |= lapic_wd_event(nmi_hz); 452 rc |= lapic_wd_event(nmi_hz);
374 break; 453 break;
375 case NMI_IO_APIC: 454 case NMI_IO_APIC:
376 /* don't know how to accurately check for this. 455 /*
456 * don't know how to accurately check for this.
377 * just assume it was a watchdog timer interrupt 457 * just assume it was a watchdog timer interrupt
378 * This matches the old behaviour. 458 * This matches the old behaviour.
379 */ 459 */
@@ -383,31 +463,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
383 return rc; 463 return rc;
384} 464}
385 465
386static unsigned ignore_nmis; 466#ifdef CONFIG_SYSCTL
387
388asmlinkage notrace __kprobes void
389do_nmi(struct pt_regs *regs, long error_code)
390{
391 nmi_enter();
392 add_pda(__nmi_count,1);
393 if (!ignore_nmis)
394 default_do_nmi(regs);
395 nmi_exit();
396}
397
398void stop_nmi(void)
399{
400 acpi_nmi_disable();
401 ignore_nmis++;
402}
403 467
404void restart_nmi(void) 468static int __init setup_unknown_nmi_panic(char *str)
405{ 469{
406 ignore_nmis--; 470 unknown_nmi_panic = 1;
407 acpi_nmi_enable(); 471 return 1;
408} 472}
409 473__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
410#ifdef CONFIG_SYSCTL
411 474
412static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 475static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
413{ 476{
@@ -415,7 +478,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
415 char buf[64]; 478 char buf[64];
416 479
417 sprintf(buf, "NMI received for unknown reason %02x\n", reason); 480 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
418 die_nmi(buf, regs, 1); /* Always panic here */ 481 die_nmi(buf, regs, 1); /* Always panic here */
419 return 0; 482 return 0;
420} 483}
421 484
@@ -433,28 +496,26 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
433 if (!!old_state == !!nmi_watchdog_enabled) 496 if (!!old_state == !!nmi_watchdog_enabled)
434 return 0; 497 return 0;
435 498
436 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { 499 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
437 printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); 500 printk(KERN_WARNING
501 "NMI watchdog is permanently disabled\n");
438 return -EIO; 502 return -EIO;
439 } 503 }
440 504
441 /* if nmi_watchdog is not set yet, then set it */
442 nmi_watchdog_default();
443
444 if (nmi_watchdog == NMI_LOCAL_APIC) { 505 if (nmi_watchdog == NMI_LOCAL_APIC) {
445 if (nmi_watchdog_enabled) 506 if (nmi_watchdog_enabled)
446 enable_lapic_nmi_watchdog(); 507 enable_lapic_nmi_watchdog();
447 else 508 else
448 disable_lapic_nmi_watchdog(); 509 disable_lapic_nmi_watchdog();
449 } else { 510 } else {
450 printk( KERN_WARNING 511 printk(KERN_WARNING
451 "NMI watchdog doesn't know what hardware to touch\n"); 512 "NMI watchdog doesn't know what hardware to touch\n");
452 return -EIO; 513 return -EIO;
453 } 514 }
454 return 0; 515 return 0;
455} 516}
456 517
457#endif 518#endif /* CONFIG_SYSCTL */
458 519
459int do_nmi_callback(struct pt_regs *regs, int cpu) 520int do_nmi_callback(struct pt_regs *regs, int cpu)
460{ 521{
@@ -477,6 +538,3 @@ void __trigger_all_cpu_backtrace(void)
477 mdelay(1); 538 mdelay(1);
478 } 539 }
479} 540}
480
481EXPORT_SYMBOL(nmi_active);
482EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
deleted file mode 100644
index 84160f74eeb0..000000000000
--- a/arch/x86/kernel/nmi_32.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <linux/delay.h>
15#include <linux/interrupt.h>
16#include <linux/module.h>
17#include <linux/nmi.h>
18#include <linux/sysdev.h>
19#include <linux/sysctl.h>
20#include <linux/percpu.h>
21#include <linux/kprobes.h>
22#include <linux/cpumask.h>
23#include <linux/kernel_stat.h>
24#include <linux/kdebug.h>
25#include <linux/slab.h>
26
27#include <asm/smp.h>
28#include <asm/nmi.h>
29
30#include "mach_traps.h"
31
32int unknown_nmi_panic;
33int nmi_watchdog_enabled;
34
35static cpumask_t backtrace_mask = CPU_MASK_NONE;
36
37/* nmi_active:
38 * >0: the lapic NMI watchdog is active, but can be disabled
39 * <0: the lapic NMI watchdog has not been set up, and cannot
40 * be enabled
41 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
44
45unsigned int nmi_watchdog = NMI_DEFAULT;
46static unsigned int nmi_hz = HZ;
47
48static DEFINE_PER_CPU(short, wd_enabled);
49
50static int endflag __initdata = 0;
51
52#ifdef CONFIG_SMP
53/* The performance counters used by NMI_LOCAL_APIC don't trigger when
54 * the CPU is idle. To make sure the NMI watchdog really ticks on all
55 * CPUs during the test make them busy.
56 */
57static __init void nmi_cpu_busy(void *data)
58{
59 local_irq_enable_in_hardirq();
60 /* Intentionally don't use cpu_relax here. This is
61 to make sure that the performance counter really ticks,
62 even if there is a simulator or similar that catches the
63 pause instruction. On a real HT machine this is fine because
64 all other CPUs are busy with "useless" delay loops and don't
65 care if they get somewhat less cycles. */
66 while (endflag == 0)
67 mb();
68}
69#endif
70
71int __init check_nmi_watchdog(void)
72{
73 unsigned int *prev_nmi_count;
74 int cpu;
75
76 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
77 return 0;
78
79 if (!atomic_read(&nmi_active))
80 return 0;
81
82 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
83 if (!prev_nmi_count)
84 return -1;
85
86 printk(KERN_INFO "Testing NMI watchdog ... ");
87
88#ifdef CONFIG_SMP
89 if (nmi_watchdog == NMI_LOCAL_APIC)
90 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
91#endif
92
93 for_each_possible_cpu(cpu)
94 prev_nmi_count[cpu] = nmi_count(cpu);
95 local_irq_enable();
96 mdelay((20*1000)/nmi_hz); // wait 20 ticks
97
98 for_each_possible_cpu(cpu) {
99#ifdef CONFIG_SMP
100 /* Check cpu_callin_map here because that is set
101 after the timer is started. */
102 if (!cpu_isset(cpu, cpu_callin_map))
103 continue;
104#endif
105 if (!per_cpu(wd_enabled, cpu))
106 continue;
107 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
108 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
109 "appears to be stuck (%d->%d)!\n",
110 cpu,
111 prev_nmi_count[cpu],
112 nmi_count(cpu));
113 per_cpu(wd_enabled, cpu) = 0;
114 atomic_dec(&nmi_active);
115 }
116 }
117 endflag = 1;
118 if (!atomic_read(&nmi_active)) {
119 kfree(prev_nmi_count);
120 atomic_set(&nmi_active, -1);
121 return -1;
122 }
123 printk("OK.\n");
124
125 /* now that we know it works we can reduce NMI frequency to
126 something more reasonable; makes a difference in some configs */
127 if (nmi_watchdog == NMI_LOCAL_APIC)
128 nmi_hz = lapic_adjust_nmi_hz(1);
129
130 kfree(prev_nmi_count);
131 return 0;
132}
133
134static int __init setup_nmi_watchdog(char *str)
135{
136 int nmi;
137
138 get_option(&str, &nmi);
139
140 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
141 return 0;
142
143 nmi_watchdog = nmi;
144 return 1;
145}
146
147__setup("nmi_watchdog=", setup_nmi_watchdog);
148
149
150/* Suspend/resume support */
151
152#ifdef CONFIG_PM
153
154static int nmi_pm_active; /* nmi_active before suspend */
155
156static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
157{
158 /* only CPU0 goes here, other CPUs should be offline */
159 nmi_pm_active = atomic_read(&nmi_active);
160 stop_apic_nmi_watchdog(NULL);
161 BUG_ON(atomic_read(&nmi_active) != 0);
162 return 0;
163}
164
165static int lapic_nmi_resume(struct sys_device *dev)
166{
167 /* only CPU0 goes here, other CPUs should be offline */
168 if (nmi_pm_active > 0) {
169 setup_apic_nmi_watchdog(NULL);
170 touch_nmi_watchdog();
171 }
172 return 0;
173}
174
175
176static struct sysdev_class nmi_sysclass = {
177 .name = "lapic_nmi",
178 .resume = lapic_nmi_resume,
179 .suspend = lapic_nmi_suspend,
180};
181
182static struct sys_device device_lapic_nmi = {
183 .id = 0,
184 .cls = &nmi_sysclass,
185};
186
187static int __init init_lapic_nmi_sysfs(void)
188{
189 int error;
190
191 /* should really be a BUG_ON but b/c this is an
192 * init call, it just doesn't work. -dcz
193 */
194 if (nmi_watchdog != NMI_LOCAL_APIC)
195 return 0;
196
197 if (atomic_read(&nmi_active) < 0)
198 return 0;
199
200 error = sysdev_class_register(&nmi_sysclass);
201 if (!error)
202 error = sysdev_register(&device_lapic_nmi);
203 return error;
204}
205/* must come after the local APIC's device_initcall() */
206late_initcall(init_lapic_nmi_sysfs);
207
208#endif /* CONFIG_PM */
209
210static void __acpi_nmi_enable(void *__unused)
211{
212 apic_write_around(APIC_LVT0, APIC_DM_NMI);
213}
214
215/*
216 * Enable timer based NMIs on all CPUs:
217 */
218void acpi_nmi_enable(void)
219{
220 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
221 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
222}
223
224static void __acpi_nmi_disable(void *__unused)
225{
226 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
227}
228
229/*
230 * Disable timer based NMIs on all CPUs:
231 */
232void acpi_nmi_disable(void)
233{
234 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
235 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
236}
237
238void setup_apic_nmi_watchdog(void *unused)
239{
240 if (__get_cpu_var(wd_enabled))
241 return;
242
243 /* cheap hack to support suspend/resume */
244 /* if cpu0 is not active neither should the other cpus */
245 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
246 return;
247
248 switch (nmi_watchdog) {
249 case NMI_LOCAL_APIC:
250 __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
251 if (lapic_watchdog_init(nmi_hz) < 0) {
252 __get_cpu_var(wd_enabled) = 0;
253 return;
254 }
255 /* FALL THROUGH */
256 case NMI_IO_APIC:
257 __get_cpu_var(wd_enabled) = 1;
258 atomic_inc(&nmi_active);
259 }
260}
261
262void stop_apic_nmi_watchdog(void *unused)
263{
264 /* only support LOCAL and IO APICs for now */
265 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
266 (nmi_watchdog != NMI_IO_APIC))
267 return;
268 if (__get_cpu_var(wd_enabled) == 0)
269 return;
270 if (nmi_watchdog == NMI_LOCAL_APIC)
271 lapic_watchdog_stop();
272 __get_cpu_var(wd_enabled) = 0;
273 atomic_dec(&nmi_active);
274}
275
276/*
277 * the best way to detect whether a CPU has a 'hard lockup' problem
278 * is to check it's local APIC timer IRQ counts. If they are not
279 * changing then that CPU has some problem.
280 *
281 * as these watchdog NMI IRQs are generated on every CPU, we only
282 * have to check the current processor.
283 *
284 * since NMIs don't listen to _any_ locks, we have to be extremely
285 * careful not to rely on unsafe variables. The printk might lock
286 * up though, so we have to break up any console locks first ...
287 * [when there will be more tty-related locks, break them up
288 * here too!]
289 */
290
291static unsigned int
292 last_irq_sums [NR_CPUS],
293 alert_counter [NR_CPUS];
294
295void touch_nmi_watchdog(void)
296{
297 if (nmi_watchdog > 0) {
298 unsigned cpu;
299
300 /*
301 * Just reset the alert counters, (other CPUs might be
302 * spinning on locks we hold):
303 */
304 for_each_present_cpu(cpu) {
305 if (alert_counter[cpu])
306 alert_counter[cpu] = 0;
307 }
308 }
309
310 /*
311 * Tickle the softlockup detector too:
312 */
313 touch_softlockup_watchdog();
314}
315EXPORT_SYMBOL(touch_nmi_watchdog);
316
317extern void die_nmi(struct pt_regs *, const char *msg);
318
319notrace __kprobes int
320nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
321{
322
323 /*
324 * Since current_thread_info()-> is always on the stack, and we
325 * always switch the stack NMI-atomically, it's safe to use
326 * smp_processor_id().
327 */
328 unsigned int sum;
329 int touched = 0;
330 int cpu = smp_processor_id();
331 int rc = 0;
332
333 /* check for other users first */
334 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
335 == NOTIFY_STOP) {
336 rc = 1;
337 touched = 1;
338 }
339
340 if (cpu_isset(cpu, backtrace_mask)) {
341 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
342
343 spin_lock(&lock);
344 printk("NMI backtrace for cpu %d\n", cpu);
345 dump_stack();
346 spin_unlock(&lock);
347 cpu_clear(cpu, backtrace_mask);
348 }
349
350 /*
351 * Take the local apic timer and PIT/HPET into account. We don't
352 * know which one is active, when we have highres/dyntick on
353 */
354 sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
355 per_cpu(irq_stat, cpu).irq0_irqs;
356
357 /* if the none of the timers isn't firing, this cpu isn't doing much */
358 if (!touched && last_irq_sums[cpu] == sum) {
359 /*
360 * Ayiee, looks like this CPU is stuck ...
361 * wait a few IRQs (5 seconds) before doing the oops ...
362 */
363 alert_counter[cpu]++;
364 if (alert_counter[cpu] == 5*nmi_hz)
365 /*
366 * die_nmi will return ONLY if NOTIFY_STOP happens..
367 */
368 die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
369 } else {
370 last_irq_sums[cpu] = sum;
371 alert_counter[cpu] = 0;
372 }
373 /* see if the nmi watchdog went off */
374 if (!__get_cpu_var(wd_enabled))
375 return rc;
376 switch (nmi_watchdog) {
377 case NMI_LOCAL_APIC:
378 rc |= lapic_wd_event(nmi_hz);
379 break;
380 case NMI_IO_APIC:
381 /* don't know how to accurately check for this.
382 * just assume it was a watchdog timer interrupt
383 * This matches the old behaviour.
384 */
385 rc = 1;
386 break;
387 }
388 return rc;
389}
390
391#ifdef CONFIG_SYSCTL
392
393static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
394{
395 unsigned char reason = get_nmi_reason();
396 char buf[64];
397
398 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
399 die_nmi(regs, buf);
400 return 0;
401}
402
403/*
404 * proc handler for /proc/sys/kernel/nmi
405 */
406int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
407 void __user *buffer, size_t *length, loff_t *ppos)
408{
409 int old_state;
410
411 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
412 old_state = nmi_watchdog_enabled;
413 proc_dointvec(table, write, file, buffer, length, ppos);
414 if (!!old_state == !!nmi_watchdog_enabled)
415 return 0;
416
417 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
418 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
419 return -EIO;
420 }
421
422 if (nmi_watchdog == NMI_DEFAULT) {
423 if (lapic_watchdog_ok())
424 nmi_watchdog = NMI_LOCAL_APIC;
425 else
426 nmi_watchdog = NMI_IO_APIC;
427 }
428
429 if (nmi_watchdog == NMI_LOCAL_APIC) {
430 if (nmi_watchdog_enabled)
431 enable_lapic_nmi_watchdog();
432 else
433 disable_lapic_nmi_watchdog();
434 } else {
435 printk( KERN_WARNING
436 "NMI watchdog doesn't know what hardware to touch\n");
437 return -EIO;
438 }
439 return 0;
440}
441
442#endif
443
444int do_nmi_callback(struct pt_regs *regs, int cpu)
445{
446#ifdef CONFIG_SYSCTL
447 if (unknown_nmi_panic)
448 return unknown_nmi_panic_callback(regs, cpu);
449#endif
450 return 0;
451}
452
453void __trigger_all_cpu_backtrace(void)
454{
455 int i;
456
457 backtrace_mask = cpu_online_map;
458 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
459 for (i = 0; i < 10 * 1000; i++) {
460 if (cpus_empty(backtrace_mask))
461 break;
462 mdelay(1);
463 }
464}
465
466EXPORT_SYMBOL(nmi_active);
467EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,9 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h>
35#include <asm/e820.h>
36#include <asm/setup.h>
34 37
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36 39
@@ -58,6 +61,8 @@ static void __init smp_dump_qct(void)
58 node_end_pfn[node] = MB_TO_PAGES( 61 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); 62 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60 63
64 e820_register_active_regions(node, node_start_pfn[node],
65 node_end_pfn[node]);
61 memory_present(node, 66 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]); 67 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node, 68 node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,23 +72,216 @@ static void __init smp_dump_qct(void)
67 } 72 }
68} 73}
69 74
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
80 75
81static int __init numaq_tsc_disable(void) 76void __cpuinit numaq_tsc_disable(void)
82{ 77{
78 if (!found_numaq)
79 return;
80
83 if (num_online_nodes() > 1) { 81 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 setup_clear_cpu_cap(X86_FEATURE_TSC); 83 setup_clear_cpu_cap(X86_FEATURE_TSC);
86 } 84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
87 return 0; 90 return 0;
88} 91}
89arch_initcall(numaq_tsc_disable); 92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static int __init numaq_setup_ioapic_ids(void)
233{
234 /* so can skip it */
235 return 1;
236}
237
238static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL,
241 .arch_pre_intr_init = NULL,
242 .arch_memory_setup = NULL,
243 .arch_intr_init = NULL,
244 .arch_trap_init = NULL,
245 .mach_get_smp_config = NULL,
246 .mach_find_smp_config = NULL,
247 .mpc_record = &mpc_record,
248 .mpc_apic_id = mpc_apic_id,
249 .mpc_oem_bus_info = mpc_oem_bus_info,
250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
253};
254
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
256 char *productid)
257{
258 if (strncmp(oem, "IBM NUMA", 8))
259 printk("Warning! Not a NUMA-Q system!\n");
260 else
261 found_numaq = 1;
262}
263
264static __init void early_check_numaq(void)
265{
266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270 /*
271 * get boot-time SMP configuration:
272 */
273 if (smp_found_config)
274 early_get_smp_config();
275
276 if (found_numaq)
277 x86_quirks = &numaq_x86_quirks;
278}
279
280int __init get_memcfg_numaq(void)
281{
282 early_check_numaq();
283 if (!found_numaq)
284 return 0;
285 smp_dump_qct();
286 return 1;
287}
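
The NUMA-Q OEM-table parser above validates the table with an 8-bit additive checksum (mpf_checksum) and converts node memory ranges from megabytes to page frames with MB_TO_PAGES. The sketch below reproduces those two small pieces of arithmetic in userspace, assuming 4 KiB pages; it is illustrative only, not the kernel build environment.

```c
/* Userspace sketch of two helpers visible in the NUMA-Q diff above:
 * an 8-bit additive checksum and the MB -> page-frame conversion.
 * PAGE_SHIFT is assumed to be 12 (4 KiB pages) for the example.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))

/* Sum all bytes; a valid MP-style table sums to 0 modulo 256. */
static int mpf_checksum(const unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;

	return sum & 0xFF;
}

int main(void)
{
	unsigned char table[4] = { 0x10, 0x20, 0x30, 0xA0 };  /* sums to 0x100 -> 0 */

	printf("checksum ok: %s\n",
	       mpf_checksum(table, sizeof(table)) == 0 ? "yes" : "no");
	printf("256 MB = %lu pages\n", (unsigned long)MB_TO_PAGES(256));
	return 0;
}
```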
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
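
pv_lock_ops is a table of function pointers that defaults to the ticket-lock implementations and can be rewired wholesale by paravirt_use_bytelocks() before the ops are used. The sketch below shows the same ops-table pattern in plain userspace C; every name in it is invented for the illustration and is not the kernel API.

```c
/* Userspace sketch of the ops-table pattern used by pv_lock_ops:
 * a struct of function pointers with defaults that an init-time hook
 * may replace. All names here are invented for the illustration.
 */
#include <stdio.h>

struct lock_ops {
	void (*lock)(void);
	void (*unlock)(void);
};

static void ticket_lock(void)   { puts("ticket lock"); }
static void ticket_unlock(void) { puts("ticket unlock"); }
static void byte_lock(void)     { puts("byte lock"); }
static void byte_unlock(void)   { puts("byte unlock"); }

/* Default ops, analogous to the __ticket_spin_* defaults above. */
static struct lock_ops lock_ops = {
	.lock   = ticket_lock,
	.unlock = ticket_unlock,
};

/* Analogous to paravirt_use_bytelocks(): swap the whole strategy early. */
static void use_bytelocks(void)
{
	lock_ops.lock   = byte_lock;
	lock_ops.unlock = byte_unlock;
}

int main(void)
{
	lock_ops.lock();
	lock_ops.unlock();

	use_bytelocks();
	lock_ops.lock();
	lock_ops.unlock();
	return 0;
}
```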
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,7 +29,9 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
34#include <asm/pgalloc.h>
33#include <asm/irq.h> 35#include <asm/irq.h>
34#include <asm/delay.h> 36#include <asm/delay.h>
35#include <asm/fixmap.h> 37#include <asm/fixmap.h>
@@ -122,6 +124,7 @@ static void *get_call_destination(u8 type)
122 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
123 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
124 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
125 }; 128 };
126 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
127} 130}
@@ -139,7 +142,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
139 /* If the operation is a nop, then nop the callsite */ 142 /* If the operation is a nop, then nop the callsite */
140 ret = paravirt_patch_nop(); 143 ret = paravirt_patch_nop();
141 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
142 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) 145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
147 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
143 /* If operation requires a jmp, then jmp */ 148 /* If operation requires a jmp, then jmp */
144 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); 149 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
145 else 150 else
@@ -190,7 +195,9 @@ static void native_flush_tlb_single(unsigned long addr)
190 195
191/* These are in entry.S */ 196/* These are in entry.S */
192extern void native_iret(void); 197extern void native_iret(void);
193extern void native_irq_enable_syscall_ret(void); 198extern void native_irq_enable_sysexit(void);
199extern void native_usergs_sysret32(void);
200extern void native_usergs_sysret64(void);
194 201
195static int __init print_banner(void) 202static int __init print_banner(void)
196{ 203{
@@ -280,7 +287,7 @@ struct pv_time_ops pv_time_ops = {
280 .get_wallclock = native_get_wallclock, 287 .get_wallclock = native_get_wallclock,
281 .set_wallclock = native_set_wallclock, 288 .set_wallclock = native_set_wallclock,
282 .sched_clock = native_sched_clock, 289 .sched_clock = native_sched_clock,
283 .get_cpu_khz = native_calculate_cpu_khz, 290 .get_tsc_khz = native_calibrate_tsc,
284}; 291};
285 292
286struct pv_irq_ops pv_irq_ops = { 293struct pv_irq_ops pv_irq_ops = {
@@ -291,6 +298,9 @@ struct pv_irq_ops pv_irq_ops = {
291 .irq_enable = native_irq_enable, 298 .irq_enable = native_irq_enable,
292 .safe_halt = native_safe_halt, 299 .safe_halt = native_safe_halt,
293 .halt = native_halt, 300 .halt = native_halt,
301#ifdef CONFIG_X86_64
302 .adjust_exception_frame = paravirt_nop,
303#endif
294}; 304};
295 305
296struct pv_cpu_ops pv_cpu_ops = { 306struct pv_cpu_ops pv_cpu_ops = {
@@ -309,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
309#endif 319#endif
310 .wbinvd = native_wbinvd, 320 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe, 321 .read_msr = native_read_msr_safe,
322 .read_msr_amd = native_read_msr_amd_safe,
312 .write_msr = native_write_msr_safe, 323 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc, 324 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc, 325 .read_pmc = native_read_pmc,
@@ -321,12 +332,27 @@ struct pv_cpu_ops pv_cpu_ops = {
321 .store_idt = native_store_idt, 332 .store_idt = native_store_idt,
322 .store_tr = native_store_tr, 333 .store_tr = native_store_tr,
323 .load_tls = native_load_tls, 334 .load_tls = native_load_tls,
335#ifdef CONFIG_X86_64
336 .load_gs_index = native_load_gs_index,
337#endif
324 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
325 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
326 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
327 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
328 346
329 .irq_enable_syscall_ret = native_irq_enable_syscall_ret, 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
348 .irq_enable_sysexit = native_irq_enable_sysexit,
349#endif
350#ifdef CONFIG_X86_64
351#ifdef CONFIG_IA32_EMULATION
352 .usergs_sysret32 = native_usergs_sysret32,
353#endif
354 .usergs_sysret64 = native_usergs_sysret64,
355#endif
330 .iret = native_iret, 356 .iret = native_iret,
331 .swapgs = native_swapgs, 357 .swapgs = native_swapgs,
332 358
@@ -341,9 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
341 367
342struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
343#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
344 .apic_write = native_apic_write,
345 .apic_write_atomic = native_apic_write_atomic,
346 .apic_read = native_apic_read,
347 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
348 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
349 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -354,6 +377,9 @@ struct pv_mmu_ops pv_mmu_ops = {
354#ifndef CONFIG_X86_64 377#ifndef CONFIG_X86_64
355 .pagetable_setup_start = native_pagetable_setup_start, 378 .pagetable_setup_start = native_pagetable_setup_start,
356 .pagetable_setup_done = native_pagetable_setup_done, 379 .pagetable_setup_done = native_pagetable_setup_done,
380#else
381 .pagetable_setup_start = paravirt_nop,
382 .pagetable_setup_done = paravirt_nop,
357#endif 383#endif
358 384
359 .read_cr2 = native_read_cr2, 385 .read_cr2 = native_read_cr2,
@@ -366,6 +392,9 @@ struct pv_mmu_ops pv_mmu_ops = {
366 .flush_tlb_single = native_flush_tlb_single, 392 .flush_tlb_single = native_flush_tlb_single,
367 .flush_tlb_others = native_flush_tlb_others, 393 .flush_tlb_others = native_flush_tlb_others,
368 394
395 .pgd_alloc = __paravirt_pgd_alloc,
396 .pgd_free = paravirt_nop,
397
369 .alloc_pte = paravirt_nop, 398 .alloc_pte = paravirt_nop,
370 .alloc_pmd = paravirt_nop, 399 .alloc_pmd = paravirt_nop,
371 .alloc_pmd_clone = paravirt_nop, 400 .alloc_pmd_clone = paravirt_nop,
@@ -380,6 +409,9 @@ struct pv_mmu_ops pv_mmu_ops = {
380 .pte_update = paravirt_nop, 409 .pte_update = paravirt_nop,
381 .pte_update_defer = paravirt_nop, 410 .pte_update_defer = paravirt_nop,
382 411
412 .ptep_modify_prot_start = __ptep_modify_prot_start,
413 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
414
383#ifdef CONFIG_HIGHPTE 415#ifdef CONFIG_HIGHPTE
384 .kmap_atomic_pte = kmap_atomic, 416 .kmap_atomic_pte = kmap_atomic,
385#endif 417#endif
@@ -403,6 +435,7 @@ struct pv_mmu_ops pv_mmu_ops = {
403#endif /* PAGETABLE_LEVELS >= 3 */ 435#endif /* PAGETABLE_LEVELS >= 3 */
404 436
405 .pte_val = native_pte_val, 437 .pte_val = native_pte_val,
438 .pte_flags = native_pte_flags,
406 .pgd_val = native_pgd_val, 439 .pgd_val = native_pgd_val,
407 440
408 .make_pte = native_make_pte, 441 .make_pte = native_make_pte,
@@ -416,6 +449,8 @@ struct pv_mmu_ops pv_mmu_ops = {
416 .enter = paravirt_nop, 449 .enter = paravirt_nop,
417 .leave = paravirt_nop, 450 .leave = paravirt_nop,
418 }, 451 },
452
453 .set_fixmap = native_set_fixmap,
419}; 454};
420 455
421EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
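
get_call_destination() in the hunk above treats the template struct of ops tables as a flat array of pointers and indexes it by patch type (*((void **)&tmpl + type)). The following is a minimal userspace sketch of that pointer-indexing idiom on a struct whose members are all pointers; it relies on the members being laid out consecutively, which is the same assumption the kernel code makes, and the struct and functions are invented for the example.

```c
/* Userspace sketch of the struct-as-pointer-array indexing used by
 * get_call_destination(). Works only because every member is a pointer
 * of the same type laid out consecutively; the struct and functions
 * here are invented for the example.
 */
#include <stdio.h>

static void op_a(void) { puts("op_a"); }
static void op_b(void) { puts("op_b"); }
static void op_c(void) { puts("op_c"); }

struct ops_template {
	void (*first)(void);
	void (*second)(void);
	void (*third)(void);
};

int main(void)
{
	struct ops_template tmpl = { op_a, op_b, op_c };

	for (int type = 0; type < 3; type++) {
		/* same idiom as *((void **)&tmpl + type) in the diff */
		void (*fn)(void) = *((void (**)(void))&tmpl + type);
		fn();
	}
	return 0;
}
```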
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 82fc5fcab4f4..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); 5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); 6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
7DEF_NATIVE(pv_cpu_ops, iret, "iret"); 7DEF_NATIVE(pv_cpu_ops, iret, "iret");
8DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); 8DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); 9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); 10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); 11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -23,13 +23,13 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
30 PATCH_SITE(pv_irq_ops, save_fl); 30 PATCH_SITE(pv_irq_ops, save_fl);
31 PATCH_SITE(pv_cpu_ops, iret); 31 PATCH_SITE(pv_cpu_ops, iret);
32 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); 32 PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
33 PATCH_SITE(pv_mmu_ops, read_cr2); 33 PATCH_SITE(pv_mmu_ops, read_cr2);
34 PATCH_SITE(pv_mmu_ops, read_cr3); 34 PATCH_SITE(pv_mmu_ops, read_cr3);
35 PATCH_SITE(pv_mmu_ops, write_cr3); 35 PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 7d904e138d7e..061d01df9ae6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, clts, "clts"); 14DEF_NATIVE(pv_cpu_ops, clts, "clts");
15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); 15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
16 16
17/* the three commands give us more control to how to return from a syscall */ 17DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
18DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); 18DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
19DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
20 21
21unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 22unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
35 PATCH_SITE(pv_irq_ops, irq_enable); 36 PATCH_SITE(pv_irq_ops, irq_enable);
36 PATCH_SITE(pv_irq_ops, irq_disable); 37 PATCH_SITE(pv_irq_ops, irq_disable);
37 PATCH_SITE(pv_cpu_ops, iret); 38 PATCH_SITE(pv_cpu_ops, iret);
38 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); 39 PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
40 PATCH_SITE(pv_cpu_ops, usergs_sysret32);
41 PATCH_SITE(pv_cpu_ops, usergs_sysret64);
39 PATCH_SITE(pv_cpu_ops, swapgs); 42 PATCH_SITE(pv_cpu_ops, swapgs);
40 PATCH_SITE(pv_mmu_ops, read_cr2); 43 PATCH_SITE(pv_mmu_ops, read_cr2);
41 PATCH_SITE(pv_mmu_ops, read_cr3); 44 PATCH_SITE(pv_mmu_ops, read_cr3);
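
Both patch files map a paravirt operation type to a native instruction template emitted by DEF_NATIVE and, at patch time, copy that template over the indirect-call site when it fits. Below is a hedged userspace sketch of the "template table plus copy if it fits" step; the byte strings are placeholders standing in for the assembler-emitted label ranges, and the function names mirror the kernel's only loosely.

```c
/* Userspace sketch of the PATCH_SITE idea: look up a native byte
 * template for an operation type and copy it into the call-site
 * buffer when it fits. Templates here are placeholder byte strings,
 * not guaranteed instruction encodings.
 */
#include <stdio.h>
#include <string.h>

struct template {
	const unsigned char *start;
	size_t len;
};

static const unsigned char tmpl_op0[] = { 0x01, 0x02, 0x03 };  /* placeholder */
static const unsigned char tmpl_op1[] = { 0x04, 0x05 };        /* placeholder */

static const struct template templates[] = {
	{ tmpl_op0, sizeof(tmpl_op0) },
	{ tmpl_op1, sizeof(tmpl_op1) },
};

/* Copy the template for 'type' into ibuf if it fits; return bytes written. */
static size_t patch_site(unsigned int type, void *ibuf, size_t len)
{
	const struct template *t = &templates[type];

	if (t->len > len)
		return 0;       /* the kernel would fall back to the indirect call */
	memcpy(ibuf, t->start, t->len);
	return t->len;
}

int main(void)
{
	unsigned char site[8];
	size_t n = patch_site(1, site, sizeof(site));

	printf("patched %zu bytes\n", n);
	return 0;
}
```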
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e28ec497e142..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -257,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
257 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
258 } 262 }
259 263
260 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
261 265
262 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
263} 267}
@@ -339,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
339 /* were we called with bad_dma_address? */ 343 /* were we called with bad_dma_address? */
340 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 344 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
341 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 345 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
342 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 346 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
343 "address 0x%Lx\n", dma_addr); 347 "address 0x%Lx\n", dma_addr);
344 WARN_ON(1);
345 return; 348 return;
346 } 349 }
347 350
@@ -410,22 +413,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 413 }
411} 414}
412 415
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 416static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 417 int nelems, int direction)
431{ 418{
@@ -436,9 +423,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 423 unsigned long entry;
437 int i; 424 int i;
438 425
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 426 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 427 BUG_ON(!sg_page(s));
444 428
@@ -474,7 +458,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 458static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 459 size_t size, int direction)
476{ 460{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 461 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 462 unsigned long uaddr;
480 unsigned int npages; 463 unsigned int npages;
@@ -483,12 +466,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 466 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 467 npages = num_dma_pages(uaddr, size);
485 468
486 if (translation_enabled(tbl)) 469 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 470}
493 471
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 472static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +475,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 475 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 476 unsigned int npages;
499 477
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 478 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 479 iommu_free(tbl, dma_handle, npages);
505} 480}
@@ -516,24 +491,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
516 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
517 order = get_order(size); 492 order = get_order(size);
518 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
519 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
520 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
521 if (!ret) 498 if (!ret)
522 goto error; 499 goto error;
523 memset(ret, 0, size); 500 memset(ret, 0, size);
524 501
525 if (translation_enabled(tbl)) { 502 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 503 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 504 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 505 goto free;
529 goto free; 506 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 507 return ret;
536
537free: 508free:
538 free_pages((unsigned long)ret, get_order(size)); 509 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 510 ret = NULL;
@@ -541,8 +512,22 @@ error:
541 return ret; 512 return ret;
542} 513}
543 514
544static const struct dma_mapping_ops calgary_dma_ops = { 515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
528static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
546 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
548 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
@@ -830,7 +815,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 815
831 tbl = pci_iommu(dev->bus); 816 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 817 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 818
819 if (is_kdump_kernel())
820 calgary_init_bitmap_from_tce_table(tbl);
821 else
822 tce_free(tbl, 0, tbl->it_size);
834 823
835 if (is_calgary(dev->device)) 824 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 825 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1198,10 @@ static int __init calgary_init(void)
1209 if (ret) 1198 if (ret)
1210 return ret; 1199 return ret;
1211 1200
1201 /* Purely for kdump kernel case */
1202 if (is_kdump_kernel())
1203 get_tce_space_from_tar();
1204
1212 do { 1205 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1206 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1207 if (!dev)
@@ -1230,6 +1223,16 @@ static int __init calgary_init(void)
1230 goto error; 1223 goto error;
1231 } while (1); 1224 } while (1);
1232 1225
1226 dev = NULL;
1227 for_each_pci_dev(dev) {
1228 struct iommu_table *tbl;
1229
1230 tbl = find_iommu_table(&dev->dev);
1231
1232 if (translation_enabled(tbl))
1233 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1234 }
1235
1233 return ret; 1236 return ret;
1234 1237
1235error: 1238error:
@@ -1251,6 +1254,7 @@ error:
1251 calgary_disable_translation(dev); 1254 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1255 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1256 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1257 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1258 } while (1);
1255 1259
1256 return ret; 1260 return ret;
@@ -1280,13 +1284,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1280static int __init build_detail_arrays(void) 1284static int __init build_detail_arrays(void)
1281{ 1285{
1282 unsigned long ptr; 1286 unsigned long ptr;
1283 int i, scal_detail_size, rio_detail_size; 1287 unsigned numnodes, i;
1288 int scal_detail_size, rio_detail_size;
1284 1289
1285 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1290 numnodes = rio_table_hdr->num_scal_dev;
1291 if (numnodes > MAX_NUMNODES){
1286 printk(KERN_WARNING 1292 printk(KERN_WARNING
1287 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1293 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1288 "but system has %d nodes.\n", 1294 "but system has %d nodes.\n",
1289 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1295 MAX_NUMNODES, numnodes);
1290 return -ENODEV; 1296 return -ENODEV;
1291 } 1297 }
1292 1298
@@ -1307,8 +1313,7 @@ static int __init build_detail_arrays(void)
1307 } 1313 }
1308 1314
1309 ptr = ((unsigned long)rio_table_hdr) + 3; 1315 ptr = ((unsigned long)rio_table_hdr) + 3;
1310 for (i = 0; i < rio_table_hdr->num_scal_dev; 1316 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1311 i++, ptr += scal_detail_size)
1312 scal_devs[i] = (struct scal_detail *)ptr; 1317 scal_devs[i] = (struct scal_detail *)ptr;
1313 1318
1314 for (i = 0; i < rio_table_hdr->num_rio_dev; 1319 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1339,6 +1344,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1344 return (val != 0xffffffff);
1340} 1345}
1341 1346
1347/*
1348 * calgary_init_bitmap_from_tce_table():
 1349 * Function for the kdump case. In the second/kdump kernel, initialize
 1350 * the bitmap based on the TCE table entries obtained from the first kernel.
1351 */
1352static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1353{
1354 u64 *tp;
1355 unsigned int index;
1356 tp = ((u64 *)tbl->it_base);
1357 for (index = 0 ; index < tbl->it_size; index++) {
1358 if (*tp != 0x0)
1359 set_bit(index, tbl->it_map);
1360 tp++;
1361 }
1362}
1363
1364/*
1365 * get_tce_space_from_tar():
 1366 * Function for the kdump case. Get the TCE tables from the first kernel
 1367 * by reading the contents of the base address register of the Calgary IOMMU.
1368 */
1369static void __init get_tce_space_from_tar(void)
1370{
1371 int bus;
1372 void __iomem *target;
1373 unsigned long tce_space;
1374
1375 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1376 struct calgary_bus_info *info = &bus_info[bus];
1377 unsigned short pci_device;
1378 u32 val;
1379
1380 val = read_pci_config(bus, 0, 0, 0);
1381 pci_device = (val & 0xFFFF0000) >> 16;
1382
1383 if (!is_cal_pci_dev(pci_device))
1384 continue;
1385 if (info->translation_disabled)
1386 continue;
1387
1388 if (calgary_bus_has_devices(bus, pci_device) ||
1389 translate_empty_slots) {
1390 target = calgary_reg(bus_info[bus].bbar,
1391 tar_offset(bus));
1392 tce_space = be64_to_cpu(readq(target));
1393 tce_space = tce_space & TAR_SW_BITS;
1394
1395 tce_space = tce_space & (~specified_table_size);
1396 info->tce_space = (u64 *)__va(tce_space);
1397 }
1398 }
1399 return;
1400}
1401
1342void __init detect_calgary(void) 1402void __init detect_calgary(void)
1343{ 1403{
1344 int bus; 1404 int bus;
@@ -1394,7 +1454,8 @@ void __init detect_calgary(void)
1394 return; 1454 return;
1395 } 1455 }
1396 1456
1397 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 1457 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1458 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1459
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1460 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1461 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1473,16 @@ void __init detect_calgary(void)
1412 1473
1413 if (calgary_bus_has_devices(bus, pci_device) || 1474 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1475 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1476 /*
1416 if (!tbl) 1477 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1478 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1479 */
1480 if (!is_kdump_kernel()) {
1481 tbl = alloc_tce_table();
1482 if (!tbl)
1483 goto cleanup;
1484 info->tce_space = tbl;
1485 }
1419 calgary_found = 1; 1486 calgary_found = 1;
1420 } 1487 }
1421 } 1488 }
@@ -1430,6 +1497,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1497 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1498 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1499 debugging ? "enabled" : "disabled");
1500
1501 /* swiotlb for devices that aren't behind the Calgary. */
1502 if (max_pfn > MAX_DMA32_PFN)
1503 swiotlb = 1;
1433 } 1504 }
1434 return; 1505 return;
1435 1506
@@ -1446,7 +1517,7 @@ int __init calgary_iommu_init(void)
1446{ 1517{
1447 int ret; 1518 int ret;
1448 1519
1449 if (no_iommu || swiotlb) 1520 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1521 return -ENODEV;
1451 1522
1452 if (!calgary_detected) 1523 if (!calgary_detected)
@@ -1459,15 +1530,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1530 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1531 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1532 "falling back to no_iommu\n", ret);
1462 if (end_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1533 return ret;
1466 } 1534 }
1467 1535
1468 force_iommu = 1; 1536 force_iommu = 1;
1469 bad_dma_address = 0x0; 1537 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1538 /* dma_ops is set to swiotlb or nommu */
1539 if (!dma_ops)
1540 dma_ops = &nommu_dma_ops;
1471 1541
1472 return 0; 1542 return 0;
1473} 1543}
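
For the kdump path added above, calgary_init_bitmap_from_tce_table() walks the TCE table inherited from the crashed kernel and marks every nonzero entry as allocated in the IOMMU bitmap, so the kdump kernel will not hand out slots that devices may still be using. The following is a userspace sketch of that bitmap rebuild; the tiny bitmap helpers are stand-ins for the kernel's set_bit()/test_bit(), not the real implementations.

```c
/* Userspace sketch of rebuilding an allocation bitmap from a table of
 * translation entries, as the kdump path above does for the Calgary
 * TCE table. The small bitmap helpers stand in for the kernel's
 * set_bit()/test_bit().
 */
#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static void set_bit_ul(unsigned long *map, unsigned int idx)
{
	map[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
}

static int test_bit_ul(const unsigned long *map, unsigned int idx)
{
	return (map[idx / BITS_PER_LONG] >> (idx % BITS_PER_LONG)) & 1;
}

/* Mark every slot whose TCE entry is nonzero as in use. */
static void init_bitmap_from_tce_table(const uint64_t *tce, unsigned int nents,
				       unsigned long *map)
{
	for (unsigned int i = 0; i < nents; i++)
		if (tce[i] != 0)
			set_bit_ul(map, i);
}

int main(void)
{
	uint64_t tce[8] = { 0, 0x1000, 0, 0, 0x2000, 0, 0, 0 };
	unsigned long map[1] = { 0 };

	init_bitmap_from_tce_table(tce, 8, map);
	for (unsigned int i = 0; i < 8; i++)
		printf("slot %u: %s\n", i, test_bit_ul(map, i) ? "used" : "free");
	return 0;
}
```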
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ace..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,13 +5,13 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h>
10 11
11int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
12EXPORT_SYMBOL(forbid_dac);
13 13
14const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
16 16
17static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -74,13 +75,17 @@ early_param("dma32_size", parse_dma32_size_opt);
74void __init dma32_reserve_bootmem(void) 75void __init dma32_reserve_bootmem(void)
75{ 76{
76 unsigned long size, align; 77 unsigned long size, align;
77 if (end_pfn <= MAX_DMA32_PFN) 78 if (max_pfn <= MAX_DMA32_PFN)
78 return; 79 return;
79 80
81 /*
 82	 * see allocate_aperture() in aperture_64.c for the reason
 83	 * 512M is used as the goal
84 */
80 align = 64ULL<<20; 85 align = 64ULL<<20;
81 size = round_up(dma32_bootmem_size, align); 86 size = roundup(dma32_bootmem_size, align);
82 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
83 __pa(MAX_DMA_ADDRESS)); 88 512ULL<<20);
84 if (dma32_bootmem_ptr) 89 if (dma32_bootmem_ptr)
85 dma32_bootmem_size = size; 90 dma32_bootmem_size = size;
86 else 91 else
@@ -88,17 +93,14 @@ void __init dma32_reserve_bootmem(void)
88} 93}
89static void __init dma32_free_bootmem(void) 94static void __init dma32_free_bootmem(void)
90{ 95{
91 int node;
92 96
93 if (end_pfn <= MAX_DMA32_PFN) 97 if (max_pfn <= MAX_DMA32_PFN)
94 return; 98 return;
95 99
96 if (!dma32_bootmem_ptr) 100 if (!dma32_bootmem_ptr)
97 return; 101 return;
98 102
99 for_each_online_node(node) 103 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
100 free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
101 dma32_bootmem_size);
102 104
103 dma32_bootmem_ptr = NULL; 105 dma32_bootmem_ptr = NULL;
104 dma32_bootmem_size = 0; 106 dma32_bootmem_size = 0;
@@ -112,22 +114,57 @@ void __init pci_iommu_alloc(void)
112 * The order of these functions is important for 114 * The order of these functions is important for
113 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
114 */ 116 */
115#ifdef CONFIG_GART_IOMMU
116 gart_iommu_hole_init(); 117 gart_iommu_hole_init();
117#endif
118 118
119#ifdef CONFIG_CALGARY_IOMMU
120 detect_calgary(); 119 detect_calgary();
121#endif
122 120
123 detect_intel_iommu(); 121 detect_intel_iommu();
124 122
125#ifdef CONFIG_SWIOTLB 123 amd_iommu_detect();
124
126 pci_swiotlb_init(); 125 pci_swiotlb_init();
127#endif
128} 126}
127
128unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_num_pages);
129#endif 135#endif
130 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
131/* 168/*
132 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
133 * documentation. 170 * documentation.
@@ -180,9 +217,7 @@ static __init int iommu_setup(char *p)
180 swiotlb = 1; 217 swiotlb = 1;
181#endif 218#endif
182 219
183#ifdef CONFIG_GART_IOMMU
184 gart_parse_options(p); 220 gart_parse_options(p);
185#endif
186 221
187#ifdef CONFIG_CALGARY_IOMMU 222#ifdef CONFIG_CALGARY_IOMMU
188 if (!strncmp(p, "calgary", 7)) 223 if (!strncmp(p, "calgary", 7))
@@ -197,136 +232,19 @@ static __init int iommu_setup(char *p)
197} 232}
198early_param("iommu", iommu_setup); 233early_param("iommu", iommu_setup);
199 234
200#ifdef CONFIG_X86_32
201int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
202 dma_addr_t device_addr, size_t size, int flags)
203{
204 void __iomem *mem_base = NULL;
205 int pages = size >> PAGE_SHIFT;
206 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
207
208 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
209 goto out;
210 if (!size)
211 goto out;
212 if (dev->dma_mem)
213 goto out;
214
215 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
216
217 mem_base = ioremap(bus_addr, size);
218 if (!mem_base)
219 goto out;
220
221 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
222 if (!dev->dma_mem)
223 goto out;
224 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
225 if (!dev->dma_mem->bitmap)
226 goto free1_out;
227
228 dev->dma_mem->virt_base = mem_base;
229 dev->dma_mem->device_base = device_addr;
230 dev->dma_mem->size = pages;
231 dev->dma_mem->flags = flags;
232
233 if (flags & DMA_MEMORY_MAP)
234 return DMA_MEMORY_MAP;
235
236 return DMA_MEMORY_IO;
237
238 free1_out:
239 kfree(dev->dma_mem);
240 out:
241 if (mem_base)
242 iounmap(mem_base);
243 return 0;
244}
245EXPORT_SYMBOL(dma_declare_coherent_memory);
246
247void dma_release_declared_memory(struct device *dev)
248{
249 struct dma_coherent_mem *mem = dev->dma_mem;
250
251 if (!mem)
252 return;
253 dev->dma_mem = NULL;
254 iounmap(mem->virt_base);
255 kfree(mem->bitmap);
256 kfree(mem);
257}
258EXPORT_SYMBOL(dma_release_declared_memory);
259
260void *dma_mark_declared_memory_occupied(struct device *dev,
261 dma_addr_t device_addr, size_t size)
262{
263 struct dma_coherent_mem *mem = dev->dma_mem;
264 int pos, err;
265 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
266
267 pages >>= PAGE_SHIFT;
268
269 if (!mem)
270 return ERR_PTR(-EINVAL);
271
272 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
273 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
274 if (err != 0)
275 return ERR_PTR(err);
276 return mem->virt_base + (pos << PAGE_SHIFT);
277}
278EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
279
280static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
281 dma_addr_t *dma_handle, void **ret)
282{
283 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
284 int order = get_order(size);
285
286 if (mem) {
287 int page = bitmap_find_free_region(mem->bitmap, mem->size,
288 order);
289 if (page >= 0) {
290 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
291 *ret = mem->virt_base + (page << PAGE_SHIFT);
292 memset(*ret, 0, size);
293 }
294 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
295 *ret = NULL;
296 }
297 return (mem != NULL);
298}
299
300static int dma_release_coherent(struct device *dev, int order, void *vaddr)
301{
302 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
303
304 if (mem && vaddr >= mem->virt_base && vaddr <
305 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
306 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
307
308 bitmap_release_region(mem->bitmap, page, order);
309 return 1;
310 }
311 return 0;
312}
313#else
314#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
315#define dma_release_coherent(dev, order, vaddr) (0)
316#endif /* CONFIG_X86_32 */
317
318int dma_supported(struct device *dev, u64 mask) 235int dma_supported(struct device *dev, u64 mask)
319{ 236{
237 struct dma_mapping_ops *ops = get_dma_ops(dev);
238
320#ifdef CONFIG_PCI 239#ifdef CONFIG_PCI
321 if (mask > 0xffffffff && forbid_dac > 0) { 240 if (mask > 0xffffffff && forbid_dac > 0) {
322 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 241 dev_info(dev, "PCI: Disallowing DAC for device\n");
323 dev->bus_id);
324 return 0; 242 return 0;
325 } 243 }
326#endif 244#endif
327 245
328 if (dma_ops->dma_supported) 246 if (ops->dma_supported)
329 return dma_ops->dma_supported(dev, mask); 247 return ops->dma_supported(dev, mask);
330 248
331 /* Copied from i386. Doesn't make much sense, because it will 249 /* Copied from i386. Doesn't make much sense, because it will
332 only work for pci_alloc_coherent. 250 only work for pci_alloc_coherent.
@@ -347,8 +265,7 @@ int dma_supported(struct device *dev, u64 mask)
347 type. Normally this doesn't make any difference, but gives 265 type. Normally this doesn't make any difference, but gives
348 more gentle handling of IOMMU overflow. */ 266 more gentle handling of IOMMU overflow. */
349 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 267 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
350 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 268 dev_info(dev, "Force SAC with mask %Lx\n", mask);
351 dev->bus_id, mask);
352 return 0; 269 return 0;
353 } 270 }
354 271
@@ -356,155 +273,15 @@ int dma_supported(struct device *dev, u64 mask)
356} 273}
357EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
358 275
359/* Allocate DMA memory on node near device */
360noinline struct page *
361dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
362{
363 int node;
364
365 node = dev_to_node(dev);
366
367 return alloc_pages_node(node, gfp, order);
368}
369
370/*
371 * Allocate memory for a coherent mapping.
372 */
373void *
374dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
375 gfp_t gfp)
376{
377 void *memory = NULL;
378 struct page *page;
379 unsigned long dma_mask = 0;
380 dma_addr_t bus;
381 int noretry = 0;
382
383 /* ignore region specifiers */
384 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
385
386 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
387 return memory;
388
389 if (!dev) {
390 dev = &fallback_dev;
391 gfp |= GFP_DMA;
392 }
393 dma_mask = dev->coherent_dma_mask;
394 if (dma_mask == 0)
395 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
396
397 /* Device not DMA able */
398 if (dev->dma_mask == NULL)
399 return NULL;
400
401 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
402 if (gfp & __GFP_DMA)
403 noretry = 1;
404
405#ifdef CONFIG_X86_64
406 /* Why <=? Even when the mask is smaller than 4GB it is often
407 larger than 16MB and in this case we have a chance of
408 finding fitting memory in the next higher zone first. If
409 not retry with true GFP_DMA. -AK */
410 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
411 gfp |= GFP_DMA32;
412 if (dma_mask < DMA_32BIT_MASK)
413 noretry = 1;
414 }
415#endif
416
417 again:
418 page = dma_alloc_pages(dev,
419 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
420 if (page == NULL)
421 return NULL;
422
423 {
424 int high, mmu;
425 bus = page_to_phys(page);
426 memory = page_address(page);
427 high = (bus + size) >= dma_mask;
428 mmu = high;
429 if (force_iommu && !(gfp & GFP_DMA))
430 mmu = 1;
431 else if (high) {
432 free_pages((unsigned long)memory,
433 get_order(size));
434
435 /* Don't use the 16MB ZONE_DMA unless absolutely
436 needed. It's better to use remapping first. */
437 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
438 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
439 goto again;
440 }
441
442 /* Let low level make its own zone decisions */
443 gfp &= ~(GFP_DMA32|GFP_DMA);
444
445 if (dma_ops->alloc_coherent)
446 return dma_ops->alloc_coherent(dev, size,
447 dma_handle, gfp);
448 return NULL;
449 }
450
451 memset(memory, 0, size);
452 if (!mmu) {
453 *dma_handle = bus;
454 return memory;
455 }
456 }
457
458 if (dma_ops->alloc_coherent) {
459 free_pages((unsigned long)memory, get_order(size));
460 gfp &= ~(GFP_DMA|GFP_DMA32);
461 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
462 }
463
464 if (dma_ops->map_simple) {
465 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
466 size,
467 PCI_DMA_BIDIRECTIONAL);
468 if (*dma_handle != bad_dma_address)
469 return memory;
470 }
471
472 if (panic_on_overflow)
473 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
474 (unsigned long)size);
475 free_pages((unsigned long)memory, get_order(size));
476 return NULL;
477}
478EXPORT_SYMBOL(dma_alloc_coherent);
479
480/*
481 * Unmap coherent memory.
482 * The caller must ensure that the device has finished accessing the mapping.
483 */
484void dma_free_coherent(struct device *dev, size_t size,
485 void *vaddr, dma_addr_t bus)
486{
487 int order = get_order(size);
488 WARN_ON(irqs_disabled()); /* for portability */
489 if (dma_release_coherent(dev, order, vaddr))
490 return;
491 if (dma_ops->unmap_single)
492 dma_ops->unmap_single(dev, bus, size, 0);
493 free_pages((unsigned long)vaddr, order);
494}
495EXPORT_SYMBOL(dma_free_coherent);
496
497static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
498{ 277{
499#ifdef CONFIG_CALGARY_IOMMU
500 calgary_iommu_init(); 278 calgary_iommu_init();
501#endif
502 279
503 intel_iommu_init(); 280 intel_iommu_init();
504 281
505#ifdef CONFIG_GART_IOMMU 282 amd_iommu_init();
283
506 gart_iommu_init(); 284 gart_iommu_init();
507#endif
508 285
509 no_iommu_init(); 286 no_iommu_init();
510 return 0; 287 return 0;
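
Two details of the pci-dma.c hunk are worth spelling out: iommu_num_pages() rounds the in-page offset plus the length up to whole pages, and dma_generic_alloc_coherent() retries in the 16 MB GFP_DMA zone only when the first allocation lands above the device's coherent mask. The sketch below reproduces the page-count arithmetic in userspace, assuming a fixed 4 KiB page size purely for the example; roundup_ul() is a local stand-in for the kernel's roundup().

```c
/* Userspace sketch of the iommu_num_pages() arithmetic added above:
 * number of pages spanned by 'len' bytes starting at physical address
 * 'addr'. PAGE_SIZE is fixed at 4 KiB here purely for the example.
 */
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

static unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
{
	unsigned long size = roundup_ul((addr & ~PAGE_MASK) + len, PAGE_SIZE);

	return size >> PAGE_SHIFT;
}

int main(void)
{
	/* 8 KiB starting 0x10 bytes into a page spans three pages */
	printf("%lu\n", iommu_num_pages(0x1010, 0x2000));
	/* 4 KiB starting on a page boundary spans exactly one page */
	printf("%lu\n", iommu_num_pages(0x2000, 0x1000));
	return 0;
}
```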
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index aa8ec928caa8..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,11 +27,12 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry;
66 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
67#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
68 69
69#define to_pages(addr, size) \
70 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
71
72#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
73 71
74#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -82,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
82AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
83 81
84static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
85static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
86 84
87static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
88{ 87{
89 unsigned long offset, flags; 88 unsigned long offset, flags;
90 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -92,27 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
92 91
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
97 96
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
99 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
100 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
101 if (offset == -1) { 100 if (offset == -1) {
102 need_flush = 1; 101 need_flush = true;
103 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
104 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
105 } 105 }
106 if (offset != -1) { 106 if (offset != -1) {
107 set_bit_string(iommu_gart_bitmap, offset, size);
108 next_bit = offset+size; 107 next_bit = offset+size;
109 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
110 next_bit = 0; 109 next_bit = 0;
111 need_flush = 1; 110 need_flush = true;
112 } 111 }
113 } 112 }
114 if (iommu_fullflush) 113 if (iommu_fullflush)
115 need_flush = 1; 114 need_flush = true;
116 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
117 116
118 return offset; 117 return offset;
@@ -137,7 +136,7 @@ static void flush_gart(void)
137 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
138 if (need_flush) { 137 if (need_flush) {
139 k8_flush_garts(); 138 k8_flush_garts();
140 need_flush = 0; 139 need_flush = false;
141 } 140 }
142 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
143} 142}
@@ -176,7 +175,8 @@ static void dump_leak(void)
176 iommu_leak_pages); 175 iommu_leak_pages);
177 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
178 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
179 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
181 } 181 }
182 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -198,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
198 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
199 */ 199 */
200 200
201 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
202 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
203 size, dev->bus_id);
204 202
205 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
206 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -217,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
217static inline int 215static inline int
218need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
219{ 217{
220 u64 mask = *dev->dma_mask; 218 return force_iommu ||
221 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
222 int mmu = high;
223
224 if (force_iommu)
225 mmu = 1;
226
227 return mmu;
228} 220}
229 221
230static inline int 222static inline int
231nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
232{ 224{
233 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
234 int high = addr + size > mask;
235 int mmu = high;
236
237 return mmu;
238} 226}
239 227
240/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
241 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
242 */ 230 */
243static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
244 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
245{ 233{
246 unsigned long npages = to_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
247 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
248 int i; 236 int i;
249 237
250 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -264,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
264 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
265} 253}
266 254
267static dma_addr_t
268gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
269{
270 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
271
272 flush_gart();
273
274 return map;
275}
276
277/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
278static dma_addr_t 256static dma_addr_t
279gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -281,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
281 unsigned long bus; 259 unsigned long bus;
282 260
283 if (!dev) 261 if (!dev)
284 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
285 263
286 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
287 return paddr; 265 return paddr;
288 266
289 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
290 269
291 return bus; 270 return bus;
292} 271}
@@ -306,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
306 return; 285 return;
307 286
308 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
309 npages = to_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size);
310 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
311 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
312 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
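to_pages() is replaced by the generic iommu_num_pages() helper in these hunks. Both answer the same question: how many GART pages does a byte range touch once the sub-page offset of its start address is accounted for. A minimal sketch of that calculation, assuming 4 KiB pages:

#include <stddef.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_PAGE_SIZE	(1UL << SKETCH_PAGE_SHIFT)

/* Pages spanned by [addr, addr + len): the offset into the first page
 * counts against the range, then round up to whole pages.
 * e.g. sketch_num_pages(0x1ff0, 0x20) == 2. */
static unsigned long sketch_num_pages(unsigned long addr, size_t len)
{
	unsigned long span = (addr & (SKETCH_PAGE_SIZE - 1)) + len;

	return (span + SKETCH_PAGE_SIZE - 1) >> SKETCH_PAGE_SHIFT;
}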
@@ -345,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
345 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
346 325
347 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
348 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
349 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
350 if (i > 0) 329 if (i > 0)
351 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -367,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
367 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
368 unsigned long pages) 347 unsigned long pages)
369{ 348{
370 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
371 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
372 struct scatterlist *s; 351 struct scatterlist *s;
373 int i; 352 int i;
@@ -389,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
389 } 368 }
390 369
391 addr = phys_addr; 370 addr = phys_addr;
392 pages = to_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length);
393 while (pages--) { 372 while (pages--) {
394 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
395 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -432,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
432 return 0; 411 return 0;
433 412
434 if (!dev) 413 if (!dev)
435 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
436 415
437 out = 0; 416 out = 0;
438 start = 0; 417 start = 0;
@@ -472,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
472 451
473 seg_size += s->length; 452 seg_size += s->length;
474 need = nextneed; 453 need = nextneed;
475 pages += to_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length);
476 ps = s; 455 ps = s;
477 } 456 }
478 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -504,6 +483,46 @@ error:
504 return 0; 483 return 0;
505} 484}
506 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
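The new gart_alloc_coherent() requests natural alignment from alloc_iommu() by passing align_mask = (1UL << get_order(size)) - 1, i.e. the mask of a power-of-two page count just large enough for the buffer, so the returned bus address is aligned to the buffer's rounded-up size as dma_alloc_coherent() callers expect. A small worked example of that arithmetic, with a hypothetical 4 KiB page size and a stand-in for get_order():

#include <stdio.h>

/* Stand-in for get_order() with 4 KiB pages: smallest 'order' such
 * that (1 << order) pages cover 'size' bytes. */
static unsigned int sketch_get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> 12;		/* pages - 1 */
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long size = 20 * 1024;			/* 20 KiB request */
	unsigned int order = sketch_get_order(size);	/* -> 3 (8 pages) */
	unsigned long mask = (1UL << order) - 1;	/* -> 0x7         */

	/* alloc_iommu() may then only hand out page offsets with
	 * (offset & mask) == 0, i.e. 8-page (32 KiB) aligned slots. */
	printf("order=%u align_mask=%#lx\n", order, mask);
	return 0;
}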
507static int no_agp; 526static int no_agp;
508 527
509static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -534,8 +553,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
534 unsigned aper_size = 0, aper_base_32, aper_order; 553 unsigned aper_size = 0, aper_base_32, aper_order;
535 u64 aper_base; 554 u64 aper_base;
536 555
537 pci_read_config_dword(dev, 0x94, &aper_base_32); 556 pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
538 pci_read_config_dword(dev, 0x90, &aper_order); 557 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
539 aper_order = (aper_order >> 1) & 7; 558 aper_order = (aper_order >> 1) & 7;
540 559
541 aper_base = aper_base_32 & 0x7fff; 560 aper_base = aper_base_32 & 0x7fff;
@@ -549,14 +568,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
549 return aper_base; 568 return aper_base;
550} 569}
551 570
571static void enable_gart_translations(void)
572{
573 int i;
574
575 for (i = 0; i < num_k8_northbridges; i++) {
576 struct pci_dev *dev = k8_northbridges[i];
577
578 enable_gart_translation(dev, __pa(agp_gatt_table));
579 }
580}
581
582/*
583 * If fix_up_north_bridges is set, the north bridges have to be fixed up on
584 * resume in the same way as they are handled in gart_iommu_hole_init().
585 */
586static bool fix_up_north_bridges;
587static u32 aperture_order;
588static u32 aperture_alloc;
589
590void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
591{
592 fix_up_north_bridges = true;
593 aperture_order = aper_order;
594 aperture_alloc = aper_alloc;
595}
596
552static int gart_resume(struct sys_device *dev) 597static int gart_resume(struct sys_device *dev)
553{ 598{
599 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
600
601 if (fix_up_north_bridges) {
602 int i;
603
604 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
605
606 for (i = 0; i < num_k8_northbridges; i++) {
607 struct pci_dev *dev = k8_northbridges[i];
608
609 /*
610 * Don't enable translations just yet. That is the next
611 * step. Restore the pre-suspend aperture settings.
612 */
613 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
614 aperture_order << 1);
615 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
616 aperture_alloc >> 25);
617 }
618 }
619
620 enable_gart_translations();
621
554 return 0; 622 return 0;
555} 623}
556 624
557static int gart_suspend(struct sys_device *dev, pm_message_t state) 625static int gart_suspend(struct sys_device *dev, pm_message_t state)
558{ 626{
559 return -EINVAL; 627 return 0;
560} 628}
561 629
562static struct sysdev_class gart_sysdev_class = { 630static struct sysdev_class gart_sysdev_class = {
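On resume the saved aperture settings are written back before translations are re-enabled. Judging from read_aperture() above and the ">> 25" here, the base register holds the aperture address in 32 MB units and the control register keeps the size order in bits [3:1]; the encode/decode sketch below spells that out. It is an inference from this code, not a statement of the AMD register specification.

#include <stdint.h>

/* Register layout as this code implies it (an assumption):
 * base register      = physical aperture base / 32 MB,
 * control bits [3:1] = aperture order. */
static uint32_t sketch_encode_aper_base(uint64_t phys_base)
{
	return (uint32_t)(phys_base >> 25);	/* 32 MB granularity */
}

static uint64_t sketch_decode_aper_base(uint32_t reg)
{
	return (uint64_t)(reg & 0x7fff) << 25;	/* mirrors read_aperture() */
}

static uint32_t sketch_encode_aper_order(uint32_t order)
{
	return (order & 7) << 1;		/* mirrors "aper_order << 1" */
}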
@@ -605,40 +673,29 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
605 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
606 674
607 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
608 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
609 if (!gatt) 678 if (!gatt)
610 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
611 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
612 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
613 682
614 memset(gatt, 0, gatt_size);
615 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
616 684
617 for (i = 0; i < num_k8_northbridges; i++) { 685 enable_gart_translations();
618 u32 gatt_reg;
619 u32 ctl;
620
621 dev = k8_northbridges[i];
622 gatt_reg = __pa(gatt) >> 12;
623 gatt_reg <<= 4;
624 pci_write_config_dword(dev, 0x98, gatt_reg);
625 pci_read_config_dword(dev, 0x90, &ctl);
626
627 ctl |= 1;
628 ctl &= ~((1<<4) | (1<<5));
629
630 pci_write_config_dword(dev, 0x90, ctl);
631 }
632 686
633 error = sysdev_class_register(&gart_sysdev_class); 687 error = sysdev_class_register(&gart_sysdev_class);
634 if (!error) 688 if (!error)
635 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
636 if (error) 690 if (error)
637 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
693
638 flush_gart(); 694 flush_gart();
639 695
640 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
641 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
698
642 return 0; 699 return 0;
643 700
644 nommu: 701 nommu:
@@ -648,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
648 return -1; 705 return -1;
649} 706}
650 707
651extern int agp_amd64_init(void); 708static struct dma_mapping_ops gart_dma_ops = {
652
653static const struct dma_mapping_ops gart_dma_ops = {
654 .mapping_error = NULL,
655 .map_single = gart_map_single, 709 .map_single = gart_map_single,
656 .map_simple = gart_map_simple,
657 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
658 .sync_single_for_cpu = NULL,
659 .sync_single_for_device = NULL,
660 .sync_single_range_for_cpu = NULL,
661 .sync_single_range_for_device = NULL,
662 .sync_sg_for_cpu = NULL,
663 .sync_sg_for_device = NULL,
664 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
665 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
666}; 715};
667 716
668void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -677,11 +726,11 @@ void gart_iommu_shutdown(void)
677 u32 ctl; 726 u32 ctl;
678 727
679 dev = k8_northbridges[i]; 728 dev = k8_northbridges[i];
680 pci_read_config_dword(dev, 0x90, &ctl); 729 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
681 730
682 ctl &= ~1; 731 ctl &= ~GARTEN;
683 732
684 pci_write_config_dword(dev, 0x90, ctl); 733 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
685 } 734 }
686} 735}
687 736
@@ -689,7 +738,8 @@ void __init gart_iommu_init(void)
689{ 738{
690 struct agp_kern_info info; 739 struct agp_kern_info info;
691 unsigned long iommu_start; 740 unsigned long iommu_start;
692 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
693 unsigned long scratch; 743 unsigned long scratch;
694 long i; 744 long i;
695 745
@@ -716,35 +766,40 @@ void __init gart_iommu_init(void)
716 return; 766 return;
717 767
718 if (no_iommu || 768 if (no_iommu ||
719 (!force_iommu && end_pfn <= MAX_DMA32_PFN) || 769 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
720 !gart_iommu_aperture || 770 !gart_iommu_aperture ||
721 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
722 if (end_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
723 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
724 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
725 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
726 } 776 }
727 return; 777 return;
728 } 778 }
729 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
730 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
731 aper_size = info.aper_size * 1024 * 1024;
732 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
733 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
734 792
735 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
736 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
737 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
738 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
739 memset(iommu_gart_bitmap, 0, iommu_pages/8);
740 797
741#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
742 if (leak_trace) { 799 if (leak_trace) {
743 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
744 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
745 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
746 memset(iommu_leak_tab, 0, iommu_pages * 8);
747 else
748 printk(KERN_DEBUG 803 printk(KERN_DEBUG
749 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
750 } 805 }
@@ -754,7 +809,7 @@ void __init gart_iommu_init(void)
754 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
755 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
756 */ 811 */
757 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
758 813
759 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
760 printk(KERN_INFO 815 printk(KERN_INFO
@@ -788,10 +843,10 @@ void __init gart_iommu_init(void)
788 wbinvd(); 843 wbinvd();
789 844
790 /* 845 /*
 791 * Try to work around a bug (thanks to BenH) 846 * Try to work around a bug (thanks to BenH):
792 * Set unmapped entries to a scratch page instead of 0. 847 * Set unmapped entries to a scratch page instead of 0.
 793 * Any prefetches that hit unmapped entries won't get a bus abort 848 * Any prefetches that hit unmapped entries won't get a bus abort
794 * then. 849 * then. (P2P bridge may be prefetching on DMA reads).
795 */ 850 */
796 scratch = get_zeroed_page(GFP_KERNEL); 851 scratch = get_zeroed_page(GFP_KERNEL);
797 if (!scratch) 852 if (!scratch)
@@ -812,7 +867,8 @@ void __init gart_parse_options(char *p)
812 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
813 leak_trace = 1; 868 leak_trace = 1;
814 p += 4; 869 p += 4;
815 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
816 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
817 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
818 } 874 }
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,14 +7,14 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
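The open-coded "bus + size > *hwdev->dma_mask" test becomes is_buffer_dma_capable() here and in the GART code above: a buffer is only DMA-able if every byte of it lies at or below the device's DMA mask. A one-line sketch of that predicate (size assumed non-zero):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch: the whole range [addr, addr + size) must fit under 'mask'. */
static bool sketch_dma_capable(uint64_t mask, uint64_t addr, size_t size)
{
	return addr + size - 1 <= mask;
}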
@@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76static int nommu_mapping_error(dma_addr_t dma_addr) 76 dma_addr_t dma_addr)
77{ 77{
78#ifdef CONFIG_X86_32 78 free_pages((unsigned long)vaddr, get_order(size));
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83} 79}
84 80
85 81struct dma_mapping_ops nommu_dma_ops = {
86const struct dma_mapping_ops nommu_dma_ops = { 82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
87 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 86 .is_phys = 1,
91}; 87};
92 88
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 490da7f4b8d0..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
@@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = {
38void __init pci_swiotlb_init(void) 38void __init pci_swiotlb_init(void)
39{ 39{
40 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 40 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
41 if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) 41 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
42 swiotlb = 1; 42 swiotlb = 1;
43 if (swiotlb_force) 43 if (swiotlb_force)
44 swiotlb = 1; 44 swiotlb = 1;
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
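platform_device_register_simple() returns either a valid device pointer or an errno encoded into the pointer itself, which is why the result can be turned straight into a return code with IS_ERR()/PTR_ERR(). A standalone sketch of that error-pointer convention, using the usual "last 4095 values are errnos" rule:

#include <stdio.h>

#define SKETCH_MAX_ERRNO 4095

/* Error pointers: -errno is stored in the top of the address space. */
static inline void *sketch_err_ptr(long error)
{
	return (void *)error;
}

static inline long sketch_ptr_err(const void *ptr)
{
	return (long)ptr;
}

static inline int sketch_is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-SKETCH_MAX_ERRNO;
}

int main(void)
{
	void *pd = sketch_err_ptr(-12);		/* pretend registration failed */

	/* Same shape as add_pcspkr(): 0 on success, -errno on failure. */
	printf("%ld\n", sketch_is_err(pd) ? sketch_ptr_err(pd) : 0L);
	return 0;
}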
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
new file mode 100644
index 000000000000..675a48c404a5
--- /dev/null
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -0,0 +1,166 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/uaccess.h>
4#include <linux/mmzone.h>
5#include <linux/ioport.h>
6#include <linux/seq_file.h>
7#include <linux/console.h>
8#include <linux/init.h>
9#include <linux/edd.h>
10#include <linux/dmi.h>
11#include <linux/pfn.h>
12#include <linux/pci.h>
13#include <asm/pci-direct.h>
14
15
16#include <asm/e820.h>
17#include <asm/mmzone.h>
18#include <asm/setup.h>
19#include <asm/sections.h>
20#include <asm/io.h>
21#include <setup_arch.h>
22
23static struct resource system_rom_resource = {
24 .name = "System ROM",
25 .start = 0xf0000,
26 .end = 0xfffff,
27 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
28};
29
30static struct resource extension_rom_resource = {
31 .name = "Extension ROM",
32 .start = 0xe0000,
33 .end = 0xeffff,
34 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
35};
36
37static struct resource adapter_rom_resources[] = { {
38 .name = "Adapter ROM",
39 .start = 0xc8000,
40 .end = 0,
41 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
42}, {
43 .name = "Adapter ROM",
44 .start = 0,
45 .end = 0,
46 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
47}, {
48 .name = "Adapter ROM",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
52}, {
53 .name = "Adapter ROM",
54 .start = 0,
55 .end = 0,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57}, {
58 .name = "Adapter ROM",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
62}, {
63 .name = "Adapter ROM",
64 .start = 0,
65 .end = 0,
66 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
67} };
68
69static struct resource video_rom_resource = {
70 .name = "Video ROM",
71 .start = 0xc0000,
72 .end = 0xc7fff,
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74};
75
76#define ROMSIGNATURE 0xaa55
77
78static int __init romsignature(const unsigned char *rom)
79{
80 const unsigned short * const ptr = (const unsigned short *)rom;
81 unsigned short sig;
82
83 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
84}
85
86static int __init romchecksum(const unsigned char *rom, unsigned long length)
87{
88 unsigned char sum, c;
89
90 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
91 sum += c;
92 return !length && !sum;
93}
94
95void __init probe_roms(void)
96{
97 const unsigned char *rom;
98 unsigned long start, length, upper;
99 unsigned char c;
100 int i;
101
102 /* video rom */
103 upper = adapter_rom_resources[0].start;
104 for (start = video_rom_resource.start; start < upper; start += 2048) {
105 rom = isa_bus_to_virt(start);
106 if (!romsignature(rom))
107 continue;
108
109 video_rom_resource.start = start;
110
111 if (probe_kernel_address(rom + 2, c) != 0)
112 continue;
113
114 /* 0 < length <= 0x7f * 512, historically */
115 length = c * 512;
116
117 /* if checksum okay, trust length byte */
118 if (length && romchecksum(rom, length))
119 video_rom_resource.end = start + length - 1;
120
121 request_resource(&iomem_resource, &video_rom_resource);
122 break;
123 }
124
125 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
126 if (start < upper)
127 start = upper;
128
129 /* system rom */
130 request_resource(&iomem_resource, &system_rom_resource);
131 upper = system_rom_resource.start;
132
133 /* check for extension rom (ignore length byte!) */
134 rom = isa_bus_to_virt(extension_rom_resource.start);
135 if (romsignature(rom)) {
136 length = extension_rom_resource.end - extension_rom_resource.start + 1;
137 if (romchecksum(rom, length)) {
138 request_resource(&iomem_resource, &extension_rom_resource);
139 upper = extension_rom_resource.start;
140 }
141 }
142
143 /* check for adapter roms on 2k boundaries */
144 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
145 rom = isa_bus_to_virt(start);
146 if (!romsignature(rom))
147 continue;
148
149 if (probe_kernel_address(rom + 2, c) != 0)
150 continue;
151
152 /* 0 < length <= 0x7f * 512, historically */
153 length = c * 512;
154
155 /* but accept any length that fits if checksum okay */
156 if (!length || start + length > upper || !romchecksum(rom, length))
157 continue;
158
159 adapter_rom_resources[i].start = start;
160 adapter_rom_resources[i].end = start + length - 1;
161 request_resource(&iomem_resource, &adapter_rom_resources[i]);
162
163 start = adapter_rom_resources[i++].end & ~2047UL;
164 }
165}
166
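probe_roms() relies on the classic expansion-ROM header: a 0xAA55 signature at offset 0, a length byte at offset 2 counted in 512-byte units, and a checksum over the whole image that must come out to zero. A small self-contained checker over an in-memory image, mirroring romsignature()/romchecksum() above:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_ROM_SIGNATURE 0xaa55

/* Validate an option-ROM image held in a buffer of 'avail' bytes. */
static bool sketch_rom_valid(const uint8_t *rom, size_t avail)
{
	size_t length, i;
	uint8_t sum = 0;

	if (avail < 3 || (rom[0] | (rom[1] << 8)) != SKETCH_ROM_SIGNATURE)
		return false;

	length = (size_t)rom[2] * 512;		/* length byte, 512-byte units */
	if (!length || length > avail)
		return false;

	for (i = 0; i < length; i++)
		sum += rom[i];			/* all bytes must sum to zero */

	return sum == 0;
}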
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685b..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,6 +6,13 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h>
10#include <asm/system.h>
11
12unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt);
14unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait);
9 16
10struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
11 18
@@ -45,6 +52,76 @@ void arch_task_cache_init(void)
45 SLAB_PANIC, NULL); 52 SLAB_PANIC, NULL);
46} 53}
47 54
55/*
56 * Idle related variables and functions
57 */
58unsigned long boot_option_idle_override = 0;
59EXPORT_SYMBOL(boot_option_idle_override);
60
61/*
 62 * Power management idle function, if any.
63 */
64void (*pm_idle)(void);
65EXPORT_SYMBOL(pm_idle);
66
67#ifdef CONFIG_X86_32
68/*
69 * This halt magic was a workaround for ancient floppy DMA
70 * wreckage. It should be safe to remove.
71 */
72static int hlt_counter;
73void disable_hlt(void)
74{
75 hlt_counter++;
76}
77EXPORT_SYMBOL(disable_hlt);
78
79void enable_hlt(void)
80{
81 hlt_counter--;
82}
83EXPORT_SYMBOL(enable_hlt);
84
85static inline int hlt_use_halt(void)
86{
87 return (!hlt_counter && boot_cpu_data.hlt_works_ok);
88}
89#else
90static inline int hlt_use_halt(void)
91{
92 return 1;
93}
94#endif
95
96/*
97 * We use this if we don't have any better
 98 * idle routine.
99 */
100void default_idle(void)
101{
102 if (hlt_use_halt()) {
103 current_thread_info()->status &= ~TS_POLLING;
104 /*
105 * TS_POLLING-cleared state must be visible before we
106 * test NEED_RESCHED:
107 */
108 smp_mb();
109
110 if (!need_resched())
111 safe_halt(); /* enables interrupts racelessly */
112 else
113 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING;
115 } else {
116 local_irq_enable();
117 /* loop is done by the caller */
118 cpu_relax();
119 }
120}
121#ifdef CONFIG_APM_MODULE
122EXPORT_SYMBOL(default_idle);
123#endif
124
48static void do_nothing(void *unused) 125static void do_nothing(void *unused)
49{ 126{
50} 127}
@@ -61,7 +138,7 @@ void cpu_idle_wait(void)
61{ 138{
62 smp_mb(); 139 smp_mb();
63 /* kick all the CPUs so that they exit out of pm_idle */ 140 /* kick all the CPUs so that they exit out of pm_idle */
64 smp_call_function(do_nothing, NULL, 0, 1); 141 smp_call_function(do_nothing, NULL, 1);
65} 142}
66EXPORT_SYMBOL_GPL(cpu_idle_wait); 143EXPORT_SYMBOL_GPL(cpu_idle_wait);
67 144
@@ -107,7 +184,8 @@ static void mwait_idle(void)
107static void poll_idle(void) 184static void poll_idle(void)
108{ 185{
109 local_irq_enable(); 186 local_irq_enable();
110 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
111} 189}
112 190
113/* 191/*
@@ -122,54 +200,170 @@ static void poll_idle(void)
122 * 200 *
123 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
124 */ 202 */
203static int __cpuinitdata force_mwait;
204
205#define MWAIT_INFO 0x05
206#define MWAIT_ECX_EXTENDED_INFO 0x01
207#define MWAIT_EDX_C1 0xf0
208
125static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 209static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
126{ 210{
211 u32 eax, ebx, ecx, edx;
212
127 if (force_mwait) 213 if (force_mwait)
128 return 1; 214 return 1;
129 215
130 if (c->x86_vendor == X86_VENDOR_AMD) { 216 if (c->cpuid_level < MWAIT_INFO)
131 switch(c->x86) { 217 return 0;
132 case 0x10: 218
133 case 0x11: 219 cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
 134 return 0; 220 /* Check whether EDX has extended info about MWAIT */
135 } 221 if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
136 } 222 return 1;
223
224 /*
 225 * edx enumerates MONITOR/MWAIT extensions. Check whether
226 * C1 supports MWAIT
227 */
228 return (edx & MWAIT_EDX_C1);
229}
230
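mwait_usable() now consults CPUID leaf 5 (MONITOR/MWAIT) instead of blacklisting AMD families: ECX bit 0 says whether the enumeration in EDX is valid at all, and the low nibbles of EDX count the MWAIT sub-states per C-state, with the 0xf0 nibble covering C1. A user-space sketch of the same probe via GCC's <cpuid.h>; the bit layout is taken from the constants above and is otherwise an assumption.

#include <cpuid.h>
#include <stdio.h>

#define SK_MWAIT_LEAF		0x05
#define SK_ECX_EXTENDED_INFO	0x01	/* EDX enumeration is valid */
#define SK_EDX_C1_SUBSTATES	0xf0	/* number of C1 sub-states  */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(SK_MWAIT_LEAF, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 5 not supported");
		return 1;
	}

	if (!(ecx & SK_ECX_EXTENDED_INFO)) {
		/* No extended info: the kernel code above assumes usable. */
		puts("no extended MWAIT info, assuming usable");
		return 0;
	}

	printf("C1 MWAIT sub-states: %u\n",
	       (edx & SK_EDX_C1_SUBSTATES) >> 4);
	return 0;
}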
231/*
 232 * Check for AMD CPUs, which potentially have C1E support
233 */
234static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
235{
236 if (c->x86_vendor != X86_VENDOR_AMD)
237 return 0;
238
239 if (c->x86 < 0x0F)
240 return 0;
241
242 /* Family 0x0f models < rev F do not have C1E */
243 if (c->x86 == 0x0f && c->x86_model < 0x40)
244 return 0;
245
137 return 1; 246 return 1;
138} 247}
139 248
140void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
141{ 253{
142 static int selected; 254 cpu_clear(cpu, c1e_mask);
255}
143 256
144 if (selected) 257/*
258 * C1E aware idle routine. We check for C1E active in the interrupt
259 * pending message MSR. If we detect C1E, then we handle it the same
260 * way as C3 power states (local apic timer and TSC stop)
261 */
262static void c1e_idle(void)
263{
264 if (need_resched())
145 return; 265 return;
266
267 if (!c1e_detected) {
268 u32 lo, hi;
269
270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
272 c1e_detected = 1;
273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
277 }
278 }
279
280 if (c1e_detected) {
281 int cpu = smp_processor_id();
282
283 if (!cpu_isset(cpu, c1e_mask)) {
284 cpu_set(cpu, c1e_mask);
285 /*
 286 * Force broadcast so ACPI cannot interfere. Needs
287 * to run with interrupts enabled as it uses
 288 * smp_call_function().
289 */
290 local_irq_enable();
291 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
292 &cpu);
293 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
294 cpu);
295 local_irq_disable();
296 }
297 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
298
299 default_idle();
300
301 /*
302 * The switch back from broadcast mode needs to be
303 * called with interrupts disabled.
304 */
305 local_irq_disable();
306 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
307 local_irq_enable();
308 } else
309 default_idle();
310}
311
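c1e_idle() detects an active C1E state by reading the interrupt-pending MSR once and, from then on, forces the local APIC timer into broadcast mode around every halt, the same treatment C3 gets. As a rough user-space illustration, the same register can be read through the msr driver; the register number (0xc0010055 for MSR_K8_INT_PENDING_MSG) and the active-bit mask below are assumptions inferred from this code and may not match every CPU.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define SK_MSR_K8_INT_PENDING_MSG	0xc0010055u	/* assumed MSR number  */
#define SK_C1E_ACTIVE_MASK		0x18000000u	/* assumed active bits */

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* needs the msr module */

	if (fd < 0 || pread(fd, &val, sizeof(val),
			    SK_MSR_K8_INT_PENDING_MSG) != sizeof(val)) {
		perror("msr read");
		return 1;
	}
	printf("C1E %s\n", (val & SK_C1E_ACTIVE_MASK) ? "active" : "not active");
	close(fd);
	return 0;
}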
312void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
313{
146#ifdef CONFIG_X86_SMP 314#ifdef CONFIG_X86_SMP
147 if (pm_idle == poll_idle && smp_num_siblings > 1) { 315 if (pm_idle == poll_idle && smp_num_siblings > 1) {
148 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 316 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
149 " performance may degrade.\n"); 317 " performance may degrade.\n");
150 } 318 }
151#endif 319#endif
320 if (pm_idle)
321 return;
322
152 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { 323 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
153 /* 324 /*
154 * Skip, if setup has overridden idle.
155 * One CPU supports mwait => All CPUs supports mwait 325 * One CPU supports mwait => All CPUs supports mwait
156 */ 326 */
157 if (!pm_idle) { 327 printk(KERN_INFO "using mwait in idle threads.\n");
158 printk(KERN_INFO "using mwait in idle threads.\n"); 328 pm_idle = mwait_idle;
159 pm_idle = mwait_idle; 329 } else if (check_c1e_idle(c)) {
160 } 330 printk(KERN_INFO "using C1E aware idle routine\n");
161 } 331 pm_idle = c1e_idle;
162 selected = 1; 332 } else
333 pm_idle = default_idle;
163} 334}
164 335
165static int __init idle_setup(char *str) 336static int __init idle_setup(char *str)
166{ 337{
338 if (!str)
339 return -EINVAL;
340
167 if (!strcmp(str, "poll")) { 341 if (!strcmp(str, "poll")) {
168 printk("using polling idle threads.\n"); 342 printk("using polling idle threads.\n");
169 pm_idle = poll_idle; 343 pm_idle = poll_idle;
170 } else if (!strcmp(str, "mwait")) 344 } else if (!strcmp(str, "mwait"))
171 force_mwait = 1; 345 force_mwait = 1;
172 else 346 else if (!strcmp(str, "halt")) {
347 /*
 348 * When the idle=halt boot option is given, halt is forced
 349 * to be used for CPU idle, so the CPU C2/C3 states won't
 350 * be entered anymore.
 351 * Leave boot_option_idle_override untouched so that the
 352 * CPU idle driver can still be loaded.
353 */
354 pm_idle = default_idle;
355 idle_halt = 1;
356 return 0;
357 } else if (!strcmp(str, "nomwait")) {
358 /*
 359 * If the "idle=nomwait" boot option is given, mwait is
 360 * disabled for the CPU C2/C3 states. In that case
 361 * boot_option_idle_override is likewise left
 362 * untouched.
363 */
364 idle_nomwait = 1;
365 return 0;
366 } else
173 return -1; 367 return -1;
174 368
175 boot_option_idle_override = 1; 369 boot_option_idle_override = 1;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e2db9ac5c61c..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,14 +56,12 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
60#include <asm/syscalls.h>
61#include <asm/smp.h>
58 62
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 64
61static int hlt_counter;
62
63unsigned long boot_option_idle_override = 0;
64EXPORT_SYMBOL(boot_option_idle_override);
65
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 65DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 66EXPORT_PER_CPU_SYMBOL(current_task);
68 67
@@ -77,80 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
77 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
78} 77}
79 78
80/* 79#ifndef CONFIG_SMP
81 * Powermanagement idle function, if any..
82 */
83void (*pm_idle)(void);
84EXPORT_SYMBOL(pm_idle);
85
86void disable_hlt(void)
87{
88 hlt_counter++;
89}
90
91EXPORT_SYMBOL(disable_hlt);
92
93void enable_hlt(void)
94{
95 hlt_counter--;
96}
97
98EXPORT_SYMBOL(enable_hlt);
99
100/*
101 * We use this if we don't have any better
102 * idle routine..
103 */
104void default_idle(void)
105{
106 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
107 current_thread_info()->status &= ~TS_POLLING;
108 /*
109 * TS_POLLING-cleared state must be visible before we
110 * test NEED_RESCHED:
111 */
112 smp_mb();
113
114 if (!need_resched())
115 safe_halt(); /* enables interrupts racelessly */
116 else
117 local_irq_enable();
118 current_thread_info()->status |= TS_POLLING;
119 } else {
120 local_irq_enable();
121 /* loop is done by the caller */
122 cpu_relax();
123 }
124}
125#ifdef CONFIG_APM_MODULE
126EXPORT_SYMBOL(default_idle);
127#endif
128
129#ifdef CONFIG_HOTPLUG_CPU
130#include <asm/nmi.h>
131/* We don't actually take CPU down, just spin without interrupts. */
132static inline void play_dead(void)
133{
134 /* This must be done before dead CPU ack */
135 cpu_exit_clear();
136 wbinvd();
137 mb();
138 /* Ack it */
139 __get_cpu_var(cpu_state) = CPU_DEAD;
140
141 /*
142 * With physical CPU hotplug, we should halt the cpu
143 */
144 local_irq_disable();
145 while (1)
146 halt();
147}
148#else
149static inline void play_dead(void) 80static inline void play_dead(void)
150{ 81{
151 BUG(); 82 BUG();
152} 83}
153#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
154 85
155/* 86/*
156 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -166,26 +97,24 @@ void cpu_idle(void)
166 97
167 /* endless idle loop with no priority at all */ 98 /* endless idle loop with no priority at all */
168 while (1) { 99 while (1) {
169 tick_nohz_stop_sched_tick(); 100 tick_nohz_stop_sched_tick(1);
170 while (!need_resched()) { 101 while (!need_resched()) {
171 void (*idle)(void);
172 102
173 check_pgt_cache(); 103 check_pgt_cache();
174 rmb(); 104 rmb();
175 idle = pm_idle;
176 105
177 if (rcu_pending(cpu)) 106 if (rcu_pending(cpu))
178 rcu_check_callbacks(cpu, 0); 107 rcu_check_callbacks(cpu, 0);
179 108
180 if (!idle)
181 idle = default_idle;
182
183 if (cpu_is_offline(cpu)) 109 if (cpu_is_offline(cpu))
184 play_dead(); 110 play_dead();
185 111
186 local_irq_disable(); 112 local_irq_disable();
187 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 113 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
188 idle(); 114 /* Don't trace irqs off for idle */
115 stop_critical_timings();
116 pm_idle();
117 start_critical_timings();
189 } 118 }
190 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
191 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
@@ -194,12 +123,13 @@ void cpu_idle(void)
194 } 123 }
195} 124}
196 125
197void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
198{ 127{
199 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
200 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
201 unsigned long sp; 130 unsigned long sp;
202 unsigned short ss, gs; 131 unsigned short ss, gs;
132 const char *board;
203 133
204 if (user_mode_vm(regs)) { 134 if (user_mode_vm(regs)) {
205 sp = regs->sp; 135 sp = regs->sp;
@@ -212,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
212 } 142 }
213 143
214 printk("\n"); 144 printk("\n");
215 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 145
146 board = dmi_get_system_info(DMI_PRODUCT_NAME);
147 if (!board)
148 board = "";
149 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
216 task_pid_nr(current), current->comm, 150 task_pid_nr(current), current->comm,
217 print_tainted(), init_utsname()->release, 151 print_tainted(), init_utsname()->release,
218 (int)strcspn(init_utsname()->version, " "), 152 (int)strcspn(init_utsname()->version, " "),
219 init_utsname()->version); 153 init_utsname()->version, board);
220 154
221 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 155 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
222 (u16)regs->cs, regs->ip, regs->flags, 156 (u16)regs->cs, regs->ip, regs->flags,
@@ -255,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
255 189
256void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
257{ 191{
258 __show_registers(regs, 1); 192 __show_regs(regs, 1);
259 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
260} 194}
261 195
@@ -316,6 +250,14 @@ void exit_thread(void)
316 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
317 put_cpu(); 251 put_cpu();
318 } 252 }
253#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
319} 261}
320 262
321void flush_thread(void) 263void flush_thread(void)
@@ -477,6 +419,35 @@ int set_tsc_mode(unsigned int val)
477 return 0; 419 return 0;
478} 420}
479 421
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
480static noinline void 451static noinline void
481__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
482 struct tss_struct *tss) 453 struct tss_struct *tss)
@@ -487,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
487 prev = &prev_p->thread; 458 prev = &prev_p->thread;
488 next = &next_p->thread; 459 next = &next_p->thread;
489 460
490 debugctl = prev->debugctlmsr; 461 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
491 if (next->ds_area_msr != prev->ds_area_msr) {
492 /* we clear debugctl to make sure DS
493 * is not in use when we change it */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
497 }
498 462
499 if (next->debugctlmsr != debugctl) 463 if (next->debugctlmsr != debugctl)
500 update_debugctlmsr(next->debugctlmsr); 464 update_debugctlmsr(next->debugctlmsr);
@@ -518,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
518 hard_enable_TSC(); 482 hard_enable_TSC();
519 } 483 }
520 484
521#ifdef X86_BTS 485#ifdef CONFIG_X86_PTRACE_BTS
522 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
523 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
524 488
525 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
526 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
527#endif 491#endif /* CONFIG_X86_PTRACE_BTS */
528 492
529 493
530 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f73cfbc2c281..749d5f888d4d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -38,11 +38,11 @@
38#include <linux/kdebug.h> 38#include <linux/kdebug.h>
39#include <linux/tick.h> 39#include <linux/tick.h>
40#include <linux/prctl.h> 40#include <linux/prctl.h>
41#include <linux/uaccess.h>
42#include <linux/io.h>
41 43
42#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
45#include <asm/io.h>
46#include <asm/processor.h> 46#include <asm/processor.h>
47#include <asm/i387.h> 47#include <asm/i387.h>
48#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
@@ -52,20 +52,12 @@
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/ia32.h> 53#include <asm/ia32.h>
54#include <asm/idle.h> 54#include <asm/idle.h>
55#include <asm/syscalls.h>
55 56
56asmlinkage extern void ret_from_fork(void); 57asmlinkage extern void ret_from_fork(void);
57 58
58unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 59unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 60
60unsigned long boot_option_idle_override = 0;
61EXPORT_SYMBOL(boot_option_idle_override);
62
63/*
64 * Powermanagement idle function, if any..
65 */
66void (*pm_idle)(void);
67EXPORT_SYMBOL(pm_idle);
68
69static ATOMIC_NOTIFIER_HEAD(idle_notifier); 61static ATOMIC_NOTIFIER_HEAD(idle_notifier);
70 62
71void idle_notifier_register(struct notifier_block *n) 63void idle_notifier_register(struct notifier_block *n)
@@ -95,48 +87,12 @@ void exit_idle(void)
95 __exit_idle(); 87 __exit_idle();
96} 88}
97 89
98/* 90#ifndef CONFIG_SMP
99 * We use this if we don't have any better
100 * idle routine..
101 */
102void default_idle(void)
103{
104 current_thread_info()->status &= ~TS_POLLING;
105 /*
106 * TS_POLLING-cleared state must be visible before we
107 * test NEED_RESCHED:
108 */
109 smp_mb();
110 if (!need_resched())
111 safe_halt(); /* enables interrupts racelessly */
112 else
113 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING;
115}
116
117#ifdef CONFIG_HOTPLUG_CPU
118DECLARE_PER_CPU(int, cpu_state);
119
120#include <asm/nmi.h>
121/* We halt the CPU with physical CPU hotplug */
122static inline void play_dead(void)
123{
124 idle_task_exit();
125 wbinvd();
126 mb();
127 /* Ack it */
128 __get_cpu_var(cpu_state) = CPU_DEAD;
129
130 local_irq_disable();
131 while (1)
132 halt();
133}
134#else
135static inline void play_dead(void) 91static inline void play_dead(void)
136{ 92{
137 BUG(); 93 BUG();
138} 94}
139#endif /* CONFIG_HOTPLUG_CPU */ 95#endif
140 96
141/* 97/*
142 * The idle thread. There's no useful work to be 98 * The idle thread. There's no useful work to be
@@ -160,14 +116,11 @@ void cpu_idle(void)
160 116
161 /* endless idle loop with no priority at all */ 117 /* endless idle loop with no priority at all */
162 while (1) { 118 while (1) {
163 tick_nohz_stop_sched_tick(); 119 tick_nohz_stop_sched_tick(1);
164 while (!need_resched()) { 120 while (!need_resched()) {
165 void (*idle)(void);
166 121
167 rmb(); 122 rmb();
168 idle = pm_idle; 123
169 if (!idle)
170 idle = default_idle;
171 if (cpu_is_offline(smp_processor_id())) 124 if (cpu_is_offline(smp_processor_id()))
172 play_dead(); 125 play_dead();
173 /* 126 /*
@@ -177,7 +130,10 @@ void cpu_idle(void)
177 */ 130 */
178 local_irq_disable(); 131 local_irq_disable();
179 enter_idle(); 132 enter_idle();
180 idle(); 133 /* Don't trace irqs off for idle */
134 stop_critical_timings();
135 pm_idle();
136 start_critical_timings();
181 /* In many cases the interrupt that ended idle 137 /* In many cases the interrupt that ended idle
182 has already called exit_idle. But some idle 138 has already called exit_idle. But some idle
183 loops can be woken up without interrupt. */ 139 loops can be woken up without interrupt. */
@@ -192,7 +148,7 @@ void cpu_idle(void)
192} 148}
193 149
194/* Prints also some state that isn't saved in the pt_regs */ 150/* Prints also some state that isn't saved in the pt_regs */
195void __show_regs(struct pt_regs * regs) 151void __show_regs(struct pt_regs *regs, int all)
196{ 152{
197 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 153 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
198 unsigned long d0, d1, d2, d3, d6, d7; 154 unsigned long d0, d1, d2, d3, d6, d7;
@@ -201,60 +157,65 @@ void __show_regs(struct pt_regs * regs)
201 157
202 printk("\n"); 158 printk("\n");
203 print_modules(); 159 print_modules();
204 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 160 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
205 current->pid, current->comm, print_tainted(), 161 current->pid, current->comm, print_tainted(),
206 init_utsname()->release, 162 init_utsname()->release,
207 (int)strcspn(init_utsname()->version, " "), 163 (int)strcspn(init_utsname()->version, " "),
208 init_utsname()->version); 164 init_utsname()->version);
209 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 165 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
210 printk_address(regs->ip, 1); 166 printk_address(regs->ip, 1);
211 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 167 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
212 regs->flags); 168 regs->sp, regs->flags);
213 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 169 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
214 regs->ax, regs->bx, regs->cx); 170 regs->ax, regs->bx, regs->cx);
215 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 171 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
216 regs->dx, regs->si, regs->di); 172 regs->dx, regs->si, regs->di);
217 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 173 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
218 regs->bp, regs->r8, regs->r9); 174 regs->bp, regs->r8, regs->r9);
219 printk("R10: %016lx R11: %016lx R12: %016lx\n", 175 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
220 regs->r10, regs->r11, regs->r12); 176 regs->r10, regs->r11, regs->r12);
221 printk("R13: %016lx R14: %016lx R15: %016lx\n", 177 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
222 regs->r13, regs->r14, regs->r15); 178 regs->r13, regs->r14, regs->r15);
223 179
224 asm("movl %%ds,%0" : "=r" (ds)); 180 asm("movl %%ds,%0" : "=r" (ds));
225 asm("movl %%cs,%0" : "=r" (cs)); 181 asm("movl %%cs,%0" : "=r" (cs));
226 asm("movl %%es,%0" : "=r" (es)); 182 asm("movl %%es,%0" : "=r" (es));
227 asm("movl %%fs,%0" : "=r" (fsindex)); 183 asm("movl %%fs,%0" : "=r" (fsindex));
228 asm("movl %%gs,%0" : "=r" (gsindex)); 184 asm("movl %%gs,%0" : "=r" (gsindex));
229 185
230 rdmsrl(MSR_FS_BASE, fs); 186 rdmsrl(MSR_FS_BASE, fs);
231 rdmsrl(MSR_GS_BASE, gs); 187 rdmsrl(MSR_GS_BASE, gs);
232 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 188 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
189
190 if (!all)
191 return;
233 192
234 cr0 = read_cr0(); 193 cr0 = read_cr0();
235 cr2 = read_cr2(); 194 cr2 = read_cr2();
236 cr3 = read_cr3(); 195 cr3 = read_cr3();
237 cr4 = read_cr4(); 196 cr4 = read_cr4();
238 197
239 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 198 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
240 fs,fsindex,gs,gsindex,shadowgs); 199 fs, fsindex, gs, gsindex, shadowgs);
241 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 200 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
242 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 201 es, cr0);
202 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
203 cr4);
243 204
244 get_debugreg(d0, 0); 205 get_debugreg(d0, 0);
245 get_debugreg(d1, 1); 206 get_debugreg(d1, 1);
246 get_debugreg(d2, 2); 207 get_debugreg(d2, 2);
247 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 208 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
248 get_debugreg(d3, 3); 209 get_debugreg(d3, 3);
249 get_debugreg(d6, 6); 210 get_debugreg(d6, 6);
250 get_debugreg(d7, 7); 211 get_debugreg(d7, 7);
251 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 212 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
252} 213}
253 214
254void show_regs(struct pt_regs *regs) 215void show_regs(struct pt_regs *regs)
255{ 216{
256 printk("CPU %d:", smp_processor_id()); 217 printk(KERN_INFO "CPU %d:", smp_processor_id());
257 __show_regs(regs); 218 __show_regs(regs, 1);
258 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 219 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
259} 220}
260 221
@@ -279,6 +240,14 @@ void exit_thread(void)
279 t->io_bitmap_max = 0; 240 t->io_bitmap_max = 0;
280 put_cpu(); 241 put_cpu();
281 } 242 }
243#ifdef CONFIG_X86_DS
244 /* Free any DS contexts that have not been properly released. */
245 if (unlikely(t->ds_ctx)) {
246 /* we clear debugctl to make sure DS is not used. */
247 update_debugctlmsr(0);
248 ds_free(t->ds_ctx);
249 }
250#endif /* CONFIG_X86_DS */
282} 251}
283 252
284void flush_thread(void) 253void flush_thread(void)
@@ -354,10 +323,10 @@ void prepare_to_copy(struct task_struct *tsk)
354 323
355int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 324int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
356 unsigned long unused, 325 unsigned long unused,
357 struct task_struct * p, struct pt_regs * regs) 326 struct task_struct *p, struct pt_regs *regs)
358{ 327{
359 int err; 328 int err;
360 struct pt_regs * childregs; 329 struct pt_regs *childregs;
361 struct task_struct *me = current; 330 struct task_struct *me = current;
362 331
363 childregs = ((struct pt_regs *) 332 childregs = ((struct pt_regs *)
@@ -378,10 +347,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
378 p->thread.fs = me->thread.fs; 347 p->thread.fs = me->thread.fs;
379 p->thread.gs = me->thread.gs; 348 p->thread.gs = me->thread.gs;
380 349
381 asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); 350 savesegment(gs, p->thread.gsindex);
382 asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); 351 savesegment(fs, p->thread.fsindex);
383 asm("mov %%es,%0" : "=m" (p->thread.es)); 352 savesegment(es, p->thread.es);
384 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 353 savesegment(ds, p->thread.ds);
385 354
386 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 355 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
387 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 356 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -402,10 +371,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
402 if (test_thread_flag(TIF_IA32)) 371 if (test_thread_flag(TIF_IA32))
403 err = do_set_thread_area(p, -1, 372 err = do_set_thread_area(p, -1,
404 (struct user_desc __user *)childregs->si, 0); 373 (struct user_desc __user *)childregs->si, 0);
405 else 374 else
406#endif 375#endif
407 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 376 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
408 if (err) 377 if (err)
409 goto out; 378 goto out;
410 } 379 }
411 err = 0; 380 err = 0;
@@ -420,7 +389,9 @@ out:
420void 389void
421start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 390start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
422{ 391{
423 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); 392 loadsegment(fs, 0);
393 loadsegment(es, 0);
394 loadsegment(ds, 0);
424 load_gs_index(0); 395 load_gs_index(0);
425 regs->ip = new_ip; 396 regs->ip = new_ip;
426 regs->sp = new_sp; 397 regs->sp = new_sp;
@@ -510,13 +481,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
510 next = &next_p->thread; 481 next = &next_p->thread;
511 482
512 debugctl = prev->debugctlmsr; 483 debugctl = prev->debugctlmsr;
513 if (next->ds_area_msr != prev->ds_area_msr) { 484
514 /* we clear debugctl to make sure DS 485#ifdef CONFIG_X86_DS
515 * is not in use when we change it */ 486 {
516 debugctl = 0; 487 unsigned long ds_prev = 0, ds_next = 0;
517 update_debugctlmsr(0); 488
518 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 489 if (prev->ds_ctx)
490 ds_prev = (unsigned long)prev->ds_ctx->ds;
491 if (next->ds_ctx)
492 ds_next = (unsigned long)next->ds_ctx->ds;
493
494 if (ds_next != ds_prev) {
495 /*
496 * We clear debugctl to make sure DS
497 * is not in use when we change it:
498 */
499 debugctl = 0;
500 update_debugctlmsr(0);
501 wrmsrl(MSR_IA32_DS_AREA, ds_next);
502 }
519 } 503 }
504#endif /* CONFIG_X86_DS */
520 505
521 if (next->debugctlmsr != debugctl) 506 if (next->debugctlmsr != debugctl)
522 update_debugctlmsr(next->debugctlmsr); 507 update_debugctlmsr(next->debugctlmsr);
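With CONFIG_X86_DS the per-thread DS buffer now hangs off a ds_context, so __switch_to_xtra() compares the buffer addresses of the outgoing and incoming threads and only rewrites MSR_IA32_DS_AREA when they differ, clearing DEBUGCTL for the duration of the change. A condensed sketch of that comparison, using the same ds_ctx->ds field as the hunk above (helper name illustrative):

    /* Sketch of the DS-area switch; assumes CONFIG_X86_DS and the fields above. */
    static void switch_ds_area(struct thread_struct *prev, struct thread_struct *next)
    {
            unsigned long ds_prev = prev->ds_ctx ? (unsigned long)prev->ds_ctx->ds : 0;
            unsigned long ds_next = next->ds_ctx ? (unsigned long)next->ds_ctx->ds : 0;

            if (ds_next != ds_prev) {
                    update_debugctlmsr(0);  /* keep DS idle while the base changes */
                    wrmsrl(MSR_IA32_DS_AREA, ds_next);
            }
    }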
@@ -554,13 +539,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
554 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 539 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
555 } 540 }
556 541
557#ifdef X86_BTS 542#ifdef CONFIG_X86_PTRACE_BTS
558 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 543 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
559 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 544 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
560 545
561 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 546 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
562 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 547 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
563#endif 548#endif /* CONFIG_X86_PTRACE_BTS */
564} 549}
565 550
566/* 551/*
@@ -575,13 +560,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
575struct task_struct * 560struct task_struct *
576__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 561__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
577{ 562{
578 struct thread_struct *prev = &prev_p->thread, 563 struct thread_struct *prev = &prev_p->thread;
579 *next = &next_p->thread; 564 struct thread_struct *next = &next_p->thread;
580 int cpu = smp_processor_id(); 565 int cpu = smp_processor_id();
581 struct tss_struct *tss = &per_cpu(init_tss, cpu); 566 struct tss_struct *tss = &per_cpu(init_tss, cpu);
567 unsigned fsindex, gsindex;
582 568
583 /* we're going to use this soon, after a few expensive things */ 569 /* we're going to use this soon, after a few expensive things */
584 if (next_p->fpu_counter>5) 570 if (next_p->fpu_counter > 5)
585 prefetch(next->xstate); 571 prefetch(next->xstate);
586 572
587 /* 573 /*
@@ -589,69 +575,82 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
589 */ 575 */
590 load_sp0(tss, next); 576 load_sp0(tss, next);
591 577
592 /* 578 /*
593 * Switch DS and ES. 579 * Switch DS and ES.
594 * This won't pick up thread selector changes, but I guess that is ok. 580 * This won't pick up thread selector changes, but I guess that is ok.
595 */ 581 */
596 asm volatile("mov %%es,%0" : "=m" (prev->es)); 582 savesegment(es, prev->es);
597 if (unlikely(next->es | prev->es)) 583 if (unlikely(next->es | prev->es))
598 loadsegment(es, next->es); 584 loadsegment(es, next->es);
599 585
600 asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); 586 savesegment(ds, prev->ds);
601 if (unlikely(next->ds | prev->ds)) 587 if (unlikely(next->ds | prev->ds))
602 loadsegment(ds, next->ds); 588 loadsegment(ds, next->ds);
603 589
590
591 /* We must save %fs and %gs before load_TLS() because
592 * %fs and %gs may be cleared by load_TLS().
593 *
594 * (e.g. xen_load_tls())
595 */
596 savesegment(fs, fsindex);
597 savesegment(gs, gsindex);
598
604 load_TLS(next, cpu); 599 load_TLS(next, cpu);
605 600
606 /* 601 /*
602 * Leave lazy mode, flushing any hypercalls made here.
603 * This must be done before restoring TLS segments so
604 * the GDT and LDT are properly updated, and must be
605 * done before math_state_restore, so the TS bit is up
606 * to date.
607 */
608 arch_leave_lazy_cpu_mode();
609
610 /*
607 * Switch FS and GS. 611 * Switch FS and GS.
612 *
613 * Segment register != 0 always requires a reload. Also
614 * reload when it has changed. When prev process used 64bit
615 * base always reload to avoid an information leak.
608 */ 616 */
609 { 617 if (unlikely(fsindex | next->fsindex | prev->fs)) {
610 unsigned fsindex; 618 loadsegment(fs, next->fsindex);
611 asm volatile("movl %%fs,%0" : "=r" (fsindex)); 619 /*
612 /* segment register != 0 always requires a reload. 620 * Check if the user used a selector != 0; if yes
613 also reload when it has changed. 621 * clear 64bit base, since overloaded base is always
614 when prev process used 64bit base always reload 622 * mapped to the Null selector
615 to avoid an information leak. */ 623 */
616 if (unlikely(fsindex | next->fsindex | prev->fs)) { 624 if (fsindex)
617 loadsegment(fs, next->fsindex); 625 prev->fs = 0;
618 /* check if the user used a selector != 0
619 * if yes clear 64bit base, since overloaded base
620 * is always mapped to the Null selector
621 */
622 if (fsindex)
623 prev->fs = 0;
624 }
625 /* when next process has a 64bit base use it */
626 if (next->fs)
627 wrmsrl(MSR_FS_BASE, next->fs);
628 prev->fsindex = fsindex;
629 } 626 }
630 { 627 /* when next process has a 64bit base use it */
631 unsigned gsindex; 628 if (next->fs)
632 asm volatile("movl %%gs,%0" : "=r" (gsindex)); 629 wrmsrl(MSR_FS_BASE, next->fs);
633 if (unlikely(gsindex | next->gsindex | prev->gs)) { 630 prev->fsindex = fsindex;
634 load_gs_index(next->gsindex); 631
635 if (gsindex) 632 if (unlikely(gsindex | next->gsindex | prev->gs)) {
636 prev->gs = 0; 633 load_gs_index(next->gsindex);
637 } 634 if (gsindex)
638 if (next->gs) 635 prev->gs = 0;
639 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
640 prev->gsindex = gsindex;
641 } 636 }
637 if (next->gs)
638 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
639 prev->gsindex = gsindex;
642 640
643 /* Must be after DS reload */ 641 /* Must be after DS reload */
644 unlazy_fpu(prev_p); 642 unlazy_fpu(prev_p);
645 643
646 /* 644 /*
647 * Switch the PDA and FPU contexts. 645 * Switch the PDA and FPU contexts.
648 */ 646 */
649 prev->usersp = read_pda(oldrsp); 647 prev->usersp = read_pda(oldrsp);
650 write_pda(oldrsp, next->usersp); 648 write_pda(oldrsp, next->usersp);
651 write_pda(pcurrent, next_p); 649 write_pda(pcurrent, next_p);
652 650
653 write_pda(kernelstack, 651 write_pda(kernelstack,
654 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 652 (unsigned long)task_stack_page(next_p) +
653 THREAD_SIZE - PDA_STACKOFFSET);
655#ifdef CONFIG_CC_STACKPROTECTOR 654#ifdef CONFIG_CC_STACKPROTECTOR
656 /* 655 /*
657 * Build time only check to make sure the stack_canary is at 656 * Build time only check to make sure the stack_canary is at
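Two things happen in the __switch_to() hunk above: %fs and %gs are saved before load_TLS() because a paravirtualized load_TLS() (e.g. xen_load_tls()) may clear them, and the FS/GS reload condition is kept as a single OR so a segment is reloaded whenever the live selector is non-zero, the incoming selector is non-zero, or the outgoing task had a 64-bit base that must not leak into the next task. The rule, written out as a tiny predicate (illustrative only):

    /*
     * Reload %fs when any of these holds:
     *  - the selector currently in the register is non-zero,
     *  - the next task wants a non-zero selector,
     *  - the previous task had a 64-bit FS base (prev->fs != 0).
     */
    static inline int fs_needs_reload(unsigned fsindex, unsigned next_fsindex,
                                      unsigned long prev_fs_base)
    {
            return (fsindex | next_fsindex | prev_fs_base) != 0;
    }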
@@ -687,7 +686,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
687 char __user * __user *envp, struct pt_regs *regs) 686 char __user * __user *envp, struct pt_regs *regs)
688{ 687{
689 long error; 688 long error;
690 char * filename; 689 char *filename;
691 690
692 filename = getname(name); 691 filename = getname(name);
693 error = PTR_ERR(filename); 692 error = PTR_ERR(filename);
@@ -745,55 +744,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
745unsigned long get_wchan(struct task_struct *p) 744unsigned long get_wchan(struct task_struct *p)
746{ 745{
747 unsigned long stack; 746 unsigned long stack;
748 u64 fp,ip; 747 u64 fp, ip;
749 int count = 0; 748 int count = 0;
750 749
751 if (!p || p == current || p->state==TASK_RUNNING) 750 if (!p || p == current || p->state == TASK_RUNNING)
752 return 0; 751 return 0;
753 stack = (unsigned long)task_stack_page(p); 752 stack = (unsigned long)task_stack_page(p);
754 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 753 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
755 return 0; 754 return 0;
756 fp = *(u64 *)(p->thread.sp); 755 fp = *(u64 *)(p->thread.sp);
757 do { 756 do {
758 if (fp < (unsigned long)stack || 757 if (fp < (unsigned long)stack ||
759 fp > (unsigned long)stack+THREAD_SIZE) 758 fp >= (unsigned long)stack+THREAD_SIZE)
760 return 0; 759 return 0;
761 ip = *(u64 *)(fp+8); 760 ip = *(u64 *)(fp+8);
762 if (!in_sched_functions(ip)) 761 if (!in_sched_functions(ip))
763 return ip; 762 return ip;
764 fp = *(u64 *)fp; 763 fp = *(u64 *)fp;
765 } while (count++ < 16); 764 } while (count++ < 16);
766 return 0; 765 return 0;
767} 766}
768 767
769long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 768long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
770{ 769{
771 int ret = 0; 770 int ret = 0;
772 int doit = task == current; 771 int doit = task == current;
773 int cpu; 772 int cpu;
774 773
775 switch (code) { 774 switch (code) {
776 case ARCH_SET_GS: 775 case ARCH_SET_GS:
777 if (addr >= TASK_SIZE_OF(task)) 776 if (addr >= TASK_SIZE_OF(task))
778 return -EPERM; 777 return -EPERM;
779 cpu = get_cpu(); 778 cpu = get_cpu();
780 /* handle small bases via the GDT because that's faster to 779 /* handle small bases via the GDT because that's faster to
781 switch. */ 780 switch. */
782 if (addr <= 0xffffffff) { 781 if (addr <= 0xffffffff) {
783 set_32bit_tls(task, GS_TLS, addr); 782 set_32bit_tls(task, GS_TLS, addr);
784 if (doit) { 783 if (doit) {
785 load_TLS(&task->thread, cpu); 784 load_TLS(&task->thread, cpu);
786 load_gs_index(GS_TLS_SEL); 785 load_gs_index(GS_TLS_SEL);
787 } 786 }
788 task->thread.gsindex = GS_TLS_SEL; 787 task->thread.gsindex = GS_TLS_SEL;
789 task->thread.gs = 0; 788 task->thread.gs = 0;
790 } else { 789 } else {
791 task->thread.gsindex = 0; 790 task->thread.gsindex = 0;
792 task->thread.gs = addr; 791 task->thread.gs = addr;
793 if (doit) { 792 if (doit) {
794 load_gs_index(0); 793 load_gs_index(0);
795 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 794 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
796 } 795 }
797 } 796 }
798 put_cpu(); 797 put_cpu();
799 break; 798 break;
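The get_wchan() part of the hunk above tightens both stack bounds from '>' to '>=': a saved frame pointer equal to stack + THREAD_SIZE is already out of range, because the walker dereferences fp and fp + 8. A self-contained illustration of the half-open check (the THREAD_SIZE value is chosen only for the demo):

    #include <stdio.h>

    #define DEMO_THREAD_SIZE 8192UL

    static int fp_in_stack(unsigned long fp, unsigned long stack)
    {
            /* half-open interval [stack, stack + THREAD_SIZE) */
            return fp >= stack && fp < stack + DEMO_THREAD_SIZE;
    }

    int main(void)
    {
            unsigned long stack = 0x100000;

            printf("%d\n", fp_in_stack(stack + DEMO_THREAD_SIZE - 16, stack)); /* 1 */
            printf("%d\n", fp_in_stack(stack + DEMO_THREAD_SIZE, stack));      /* 0 */
            return 0;
    }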
@@ -809,7 +808,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
809 set_32bit_tls(task, FS_TLS, addr); 808 set_32bit_tls(task, FS_TLS, addr);
810 if (doit) { 809 if (doit) {
811 load_TLS(&task->thread, cpu); 810 load_TLS(&task->thread, cpu);
812 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 811 loadsegment(fs, FS_TLS_SEL);
813 } 812 }
814 task->thread.fsindex = FS_TLS_SEL; 813 task->thread.fsindex = FS_TLS_SEL;
815 task->thread.fs = 0; 814 task->thread.fs = 0;
@@ -819,7 +818,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
819 if (doit) { 818 if (doit) {
820 /* set the selector to 0 to not confuse 819 /* set the selector to 0 to not confuse
821 __switch_to */ 820 __switch_to */
822 asm volatile("movl %0,%%fs" :: "r" (0)); 821 loadsegment(fs, 0);
823 ret = checking_wrmsrl(MSR_FS_BASE, addr); 822 ret = checking_wrmsrl(MSR_FS_BASE, addr);
824 } 823 }
825 } 824 }
@@ -842,13 +841,12 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
842 if (task->thread.gsindex == GS_TLS_SEL) 841 if (task->thread.gsindex == GS_TLS_SEL)
843 base = read_32bit_tls(task, GS_TLS); 842 base = read_32bit_tls(task, GS_TLS);
844 else if (doit) { 843 else if (doit) {
845 asm("movl %%gs,%0" : "=r" (gsindex)); 844 savesegment(gs, gsindex);
846 if (gsindex) 845 if (gsindex)
847 rdmsrl(MSR_KERNEL_GS_BASE, base); 846 rdmsrl(MSR_KERNEL_GS_BASE, base);
848 else 847 else
849 base = task->thread.gs; 848 base = task->thread.gs;
850 } 849 } else
851 else
852 base = task->thread.gs; 850 base = task->thread.gs;
853 ret = put_user(base, (unsigned long __user *)addr); 851 ret = put_user(base, (unsigned long __user *)addr);
854 break; 852 break;
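The ARCH_GET_GS case above returns the base either from the 32-bit TLS slot, from MSR_KERNEL_GS_BASE, or from the saved thread.gs. From user space the same code is reached through the arch_prctl() syscall; a small usage example, assuming an x86-64 system whose headers provide <asm/prctl.h> and SYS_arch_prctl (there is typically no dedicated libc wrapper):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <asm/prctl.h>          /* ARCH_GET_FS, ARCH_GET_GS */

    int main(void)
    {
            unsigned long fsbase = 0, gsbase = 0;

            if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) ||
                syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase))
                    perror("arch_prctl");

            printf("FS base: %#lx  GS base: %#lx\n", fsbase, gsbase);
            return 0;
    }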
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f282936..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
39 REGSET_GENERAL, 40 REGSET_GENERAL,
40 REGSET_FP, 41 REGSET_FP,
41 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
42 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
43}; 46};
44 47
45/* 48/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
69 72
70#define FLAG_MASK FLAG_MASK_32 73#define FLAG_MASK FLAG_MASK_32
71 74
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{ 76{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2; 78 regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 557 return 0;
555} 558}
556 559
557#ifdef X86_BTS 560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
558 569
559static int ptrace_bts_get_size(struct task_struct *child) 570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
560{ 574{
561 if (!child->thread.ds_area_msr) 575 if (!target->thread.io_bitmap_ptr)
562 return -ENXIO; 576 return -ENXIO;
563 577
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640	base += (bts_cfg.sizeof_field * field);
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
565} 661}
566 662
567static int ptrace_bts_read_record(struct task_struct *child, 663static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 664 struct bts_struct __user *out)
570{ 665{
571 struct bts_struct ret; 666 struct bts_struct ret;
572 int retval; 667 const void *bts_record;
573 int bts_end; 668 size_t bts_index, bts_end;
574 int bts_index; 669 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 670
579 if (index < 0) 671 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 672 if (error < 0)
673 return error;
581 674
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 675 if (bts_end <= index)
584 return -EINVAL; 676 return -EINVAL;
585 677
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681
586 /* translate the ptrace bts index into the ds bts index */ 682 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 683 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 684 if (bts_end <= bts_index)
589 if (bts_index < 0) 685 bts_index -= bts_end;
590 bts_index += bts_end; 686
687 error = ds_access_bts(child, bts_index, &bts_record);
688 if (error < 0)
689 return error;
591 690
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 691 ptrace_bts_translate_record(&ret, bts_record);
593 bts_index, &ret);
594 if (retval < 0)
595 return retval;
596 692
597 if (copy_to_user(out, &ret, sizeof(ret))) 693 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 694 return -EFAULT;
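bts_get() and bts_set() above treat a raw BTS record as an array of equally sized fields, with the field width taken from the detected hardware configuration (4-byte fields on Netburst and Pentium M, 8-byte fields on Core2 and Atom). A self-contained demo of that layout using an 8-byte field width; all names here are illustrative, not the kernel's:

    #include <stdio.h>
    #include <string.h>

    enum { DEMO_FROM, DEMO_TO, DEMO_FLAGS };

    static const unsigned int demo_sizeof_field = 8;   /* Core2-style fields */

    static unsigned long long demo_get(const char *base, int field)
    {
            unsigned long long val;
            memcpy(&val, base + demo_sizeof_field * field, sizeof(val));
            return val;
    }

    static void demo_set(char *base, int field, unsigned long long val)
    {
            memcpy(base + demo_sizeof_field * field, &val, sizeof(val));
    }

    int main(void)
    {
            char record[8 * 3] = { 0 };

            demo_set(record, DEMO_FROM, 0x400123);
            demo_set(record, DEMO_TO, 0x400456);
            printf("branch %#llx -> %#llx\n",
                   demo_get(record, DEMO_FROM), demo_get(record, DEMO_TO));
            return 0;
    }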
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 696 return sizeof(ret);
601} 697}
602 698
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 699static int ptrace_bts_drain(struct task_struct *child,
612 long size, 700 long size,
613 struct bts_struct __user *out) 701 struct bts_struct __user *out)
614{ 702{
615 int end, i; 703 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 704 const unsigned char *raw;
617 705 size_t end, i;
618 if (!ds) 706 int error;
619 return -ENXIO;
620 707
621 end = ds_get_bts_index(ds); 708 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 709 if (error < 0)
623 return end; 710 return error;
624 711
625 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 713 return -EIO;
627 714
628 for (i = 0; i < end; i++, out++) { 715 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 716 if (error < 0)
630 int retval; 717 return error;
631 718
632 retval = ds_read_bts(ds, i, &ret); 719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 720 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 721
636 if (copy_to_user(out, &ret, sizeof(ret))) 722 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 723 return -EFAULT;
638 } 724 }
639 725
640 ds_clear(ds); 726 error = ds_clear_bts(child);
727 if (error < 0)
728 return error;
641 729
642 return end; 730 return end;
643} 731}
644 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
645static int ptrace_bts_config(struct task_struct *child, 738static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 739 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 740 const struct ptrace_bts_config __user *ucfg)
648{ 741{
649 struct ptrace_bts_config cfg; 742 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 743 int error = 0;
651 void *ds; 744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
652 748
749 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 750 if (cfg_size < sizeof(cfg))
654 return -EIO; 751 goto errout;
655 752
753 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 754 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 755 goto errout;
658 756
659 if ((int)cfg.size < 0) 757 error = -EINVAL;
660 return -EINVAL; 758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
661 761
662 bts_size = 0; 762 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 763 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 764 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 765
666 if (bts_size < 0) 766 /* we ignore the error in case we were not tracing child */
667 return bts_size; 767 (void)ds_release_bts(child);
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 768
671 if (bts_size != cfg.size) { 769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
672 ret = ptrace_bts_realloc(child, cfg.size, 770 if (!cfg.signal)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 771 goto errout;
674 if (ret < 0) 772
773 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 }
776
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
778 if (error < 0)
675 goto errout; 779 goto errout;
676 780
677 ds = (void *)child->thread.ds_area_msr; 781 child->thread.bts_ovfl_signal = sig;
678 } 782 }
679 783
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 784 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 785 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 786 goto errout;
686 787
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 788 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 790 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 792
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 793 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 795 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 797
697 ret = sizeof(cfg); 798 error = sizeof(cfg);
698 799
699out: 800out:
700 if (child->thread.debugctlmsr) 801 if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
702 else 803 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 805
705 return ret; 806 return error;
706 807
707errout: 808errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 811 goto out;
711} 812}
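ptrace_bts_config() now allocates the trace buffer through ds_request_bts() and records the requested overflow signal; errors are staged with the usual "set error, goto errout" pattern so the debugctl/TIF cleanup happens in one place. Seen from a tracer, the request is still PTRACE_BTS_CONFIG carrying a struct ptrace_bts_config (size, flags, signal, bts_size). A hedged usage sketch, assuming a kernel built with CONFIG_X86_PTRACE_BTS whose <asm/ptrace-abi.h> exports the PTRACE_BTS_* requests and PTRACE_BTS_O_* flags:

    #include <string.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <asm/ptrace-abi.h>     /* PTRACE_BTS_CONFIG, PTRACE_BTS_O_* */

    /* Same field order as ptrace_bts_config()/ptrace_bts_status() use above. */
    struct bts_config_demo {
            unsigned int size;      /* requested buffer size in bytes */
            unsigned int flags;     /* PTRACE_BTS_O_ALLOC, _O_TRACE, ... */
            unsigned int signal;    /* overflow signal, with PTRACE_BTS_O_SIGNAL */
            unsigned int bts_size;  /* filled in by PTRACE_BTS_STATUS */
    };

    static long request_bts(pid_t child)
    {
            struct bts_config_demo cfg;

            memset(&cfg, 0, sizeof(cfg));
            cfg.size = 4096;
            cfg.flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE;

            /* arch_ptrace() maps addr -> ucfg and data -> cfg_size. */
            return ptrace(PTRACE_BTS_CONFIG, child, &cfg, (void *)sizeof(cfg));
    }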
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 815 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 816 struct ptrace_bts_config __user *ucfg)
716{ 817{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 818 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
719 822
720 if (cfg_size < sizeof(cfg)) 823 if (cfg_size < sizeof(cfg))
721 return -EIO; 824 return -EIO;
722 825
723 memset(&cfg, 0, sizeof(cfg)); 826 error = ds_get_bts_end(child, &end);
827 if (error < 0)
828 return error;
724 829
725 if (ds) { 830 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 831 if (error < 0)
832 return error;
727 833
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 834 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 835 if (error < 0)
836 return error;
730 837
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 838 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 839 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 840 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct);
734 842
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 843 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 844 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 845
739 cfg.bts_size = sizeof(struct bts_struct); 846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE;
849
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
851 cfg.flags |= PTRACE_BTS_O_SCHED;
740 852
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 853 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 854 return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 856 return sizeof(cfg);
745} 857}
746 858
747
748static int ptrace_bts_write_record(struct task_struct *child, 859static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 860 const struct bts_struct *in)
750{ 861{
751 int retval; 862 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 863
753 if (!child->thread.ds_area_msr) 864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 865
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 866 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 867 switch (in->qualifier) {
758 return retval; 868 case BTS_INVALID:
869 break;
759 870
760 return sizeof(*in); 871 case BTS_BRANCH:
761} 872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
762 875
763static int ptrace_bts_realloc(struct task_struct *child, 876 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 877 case BTS_TASK_DEPARTS:
765{ 878 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 879 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
768 882
769 if (size < 0) 883 default:
770 return -EINVAL; 884 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 } 885 }
801 886
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 887 /* The writing task will be the switched-to task on a context
803 vm = current->mm->locked_vm + size; 888 * switch. It needs to write into the switched-from task's BTS
804 if (rlim < vm) { 889 * buffer. */
805 ret = -ENOMEM; 890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 }
814
815 ret = ds_allocate((void **)&child->thread.ds_area_msr,
816 size << PAGE_SHIFT);
817 if (ret < 0)
818 goto out;
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 891}
831 892
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 893void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 900
840 ptrace_bts_write_record(tsk, &rec); 901 ptrace_bts_write_record(tsk, &rec);
841} 902}
842#endif /* X86_BTS */ 903
904static const struct bts_configuration bts_cfg_netburst = {
905 .sizeof_bts = sizeof(long) * 3,
906 .sizeof_field = sizeof(long),
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909
910static const struct bts_configuration bts_cfg_pentium_m = {
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961}
962#endif /* CONFIG_X86_PTRACE_BTS */
843 963
844/* 964/*
845 * Called by kernel/ptrace.c when detaching.. 965 * Called by kernel/ptrace.c when detaching..
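ptrace_bts_init_intel() above picks the BTS record format from the CPU family and model; a model number like 0x1C (Atom) only appears once the extended-model bits of CPUID leaf 1 are folded in. A small user-space demo of that decoding, using GCC's <cpuid.h>:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, family, model;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;

            family = (eax >> 8) & 0xf;
            model  = (eax >> 4) & 0xf;
            if (family == 0x6 || family == 0xf)
                    model |= ((eax >> 16) & 0xf) << 4;  /* extended model, e.g. 0x1C */
            if (family == 0xf)
                    family += (eax >> 20) & 0xff;       /* extended family */

            printf("family 0x%x model 0x%x\n", family, model);
            return 0;
    }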
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 972#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 974#endif
855 if (child->thread.ds_area_msr) { 975#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 976 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 977
858#endif 978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 979 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 981
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 983#endif /* CONFIG_X86_PTRACE_BTS */
864} 984}
865 985
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -943,13 +1063,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
943 return copy_regset_to_user(child, &user_x86_32_view, 1063 return copy_regset_to_user(child, &user_x86_32_view,
944 REGSET_XFP, 1064 REGSET_XFP,
945 0, sizeof(struct user_fxsr_struct), 1065 0, sizeof(struct user_fxsr_struct),
946 datap); 1066 datap) ? -EIO : 0;
947 1067
948 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 1068 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
949 return copy_regset_from_user(child, &user_x86_32_view, 1069 return copy_regset_from_user(child, &user_x86_32_view,
950 REGSET_XFP, 1070 REGSET_XFP,
951 0, sizeof(struct user_fxsr_struct), 1071 0, sizeof(struct user_fxsr_struct),
952 datap); 1072 datap) ? -EIO : 0;
953#endif 1073#endif
954 1074
955#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1075#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1100 /*
981 * These bits need more cooking - not enabled yet: 1101 * These bits need more cooking - not enabled yet:
982 */ 1102 */
983#ifdef X86_BTS 1103#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1104 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1105 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1106 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1112 break;
993 1113
994 case PTRACE_BTS_SIZE: 1114 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1115 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1116 break;
997 1117
998 case PTRACE_BTS_GET: 1118 case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1121 break;
1002 1122
1003 case PTRACE_BTS_CLEAR: 1123 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1124 ret = ds_clear_bts(child);
1005 break; 1125 break;
1006 1126
1007 case PTRACE_BTS_DRAIN: 1127 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1128 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1129 (child, data, (struct bts_struct __user *) addr);
1010 break; 1130 break;
1011#endif 1131#endif /* CONFIG_X86_PTRACE_BTS */
1012 1132
1013 default: 1133 default:
1014 ret = ptrace_request(child, request, addr, data); 1134 ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1290 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1291 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1292 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1293}; 1419};
1294 1420
1295static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1336 .active = regset_tls_active, 1462 .active = regset_tls_active,
1337 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1338 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1339}; 1471};
1340 1472
1341static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1357,9 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1489#endif
1358} 1490}
1359 1491
1360#ifdef CONFIG_X86_32 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1361 1493 int error_code, int si_code)
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1494{
1364 struct siginfo info; 1495 struct siginfo info;
1365 1496
@@ -1368,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1368 1499
1369 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1370 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1371 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1372 1503
1373 /* User-mode ip? */ 1504 /* User-mode ip? */
1374 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1377,143 +1508,83 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1508 force_sig_info(SIGTRAP, &info, tsk);
1378} 1509}
1379 1510
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392 1511
1393 /* do the secure computing check first */ 1512#ifdef CONFIG_X86_32
1394 if (!entryexit) 1513# define IS_IA32 1
1395 secure_computing(regs->orig_ax); 1514#elif defined CONFIG_IA32_EMULATION
1396 1515# define IS_IA32 test_thread_flag(TIF_IA32)
1397 if (unlikely(current->audit_context)) { 1516#else
1398 if (entryexit) 1517# define IS_IA32 0
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax), 1518#endif
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460 1519
1461static void syscall_trace(struct pt_regs *regs) 1520/*
1521 * We must return the syscall number to actually look up in the table.
1522 * This can be -1L to skip running any syscall at all.
1523 */
1524asmregparm long syscall_trace_enter(struct pt_regs *regs)
1462{ 1525{
1526 long ret = 0;
1463 1527
1464#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1466 current->comm,
1467 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1468 current_thread_info()->flags, current->ptrace);
1469#endif
1470
1471 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1472 ? 0x80 : 0));
1473 /* 1528 /*
1474 * this isn't the same as continuing with a signal, but it will do 1529 * If we stepped into a sysenter/syscall insn, it trapped in
1475 * for normal use. strace only continues with a signal if the 1530 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1476 * stopping signal is not SIGTRAP. -brl 1531 * If user-mode had set TF itself, then it's still clear from
1532 * do_debug() and we need to set it again to restore the user
1533 * state. If we entered on the slow path, TF was already set.
1477 */ 1534 */
1478 if (current->exit_code) { 1535 if (test_thread_flag(TIF_SINGLESTEP))
1479 send_sig(current->exit_code, current, 1); 1536 regs->flags |= X86_EFLAGS_TF;
1480 current->exit_code = 0;
1481 }
1482}
1483 1537
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1485{
1486 /* do the secure computing check first */ 1538 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1539 secure_computing(regs->orig_ax);
1488 1540
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1541 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1542 ret = -1L;
1491 syscall_trace(regs); 1543
1544 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1545 tracehook_report_syscall_entry(regs))
1546 ret = -1L;
1492 1547
1493 if (unlikely(current->audit_context)) { 1548 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1549 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1550 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1551 regs->orig_ax,
1497 regs->bx, regs->cx, 1552 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1553 regs->dx, regs->si);
1499 } else { 1554#ifdef CONFIG_X86_64
1555 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1556 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1557 regs->orig_ax,
1502 regs->di, regs->si, 1558 regs->di, regs->si,
1503 regs->dx, regs->r10); 1559 regs->dx, regs->r10);
1504 } 1560#endif
1505 } 1561 }
1562
1563 return ret ?: regs->orig_ax;
1506} 1564}
1507 1565
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1566asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1567{
1510 if (unlikely(current->audit_context)) 1568 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1569 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1570
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1571 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP)) 1572 tracehook_report_syscall_exit(regs, 0);
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs);
1517}
1518 1573
1519#endif /* CONFIG_X86_32 */ 1574 /*
1575 * If TIF_SYSCALL_EMU is set, we only get here because of
1576 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1577 * We already reported this syscall instruction in
1578 * syscall_trace_enter(), so don't do any more now.
1579 */
1580 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1581 return;
1582
1583 /*
1584 * If we are single-stepping, synthesize a trap to follow the
1585 * system call instruction.
1586 */
1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1590}
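The rewritten syscall_trace_enter()/syscall_trace_leave() report through the tracehook layer and return the syscall number to run (or -1L to skip it), but the user-visible ptrace contract is unchanged: with PTRACE_O_TRACESYSGOOD a tracer still sees syscall stops as SIGTRAP | 0x80. A minimal tracer loop showing that contract, independent of the kernel internals above:

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/wait.h>

    int main(void)
    {
            pid_t child = fork();
            int status;

            if (child == 0) {
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    raise(SIGSTOP);
                    execlp("true", "true", (char *)NULL);
                    _exit(1);
            }

            waitpid(child, &status, 0);     /* initial SIGSTOP */
            ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACESYSGOOD);

            for (;;) {
                    ptrace(PTRACE_SYSCALL, child, NULL, NULL);
                    if (waitpid(child, &status, 0) < 0 || WIFEXITED(status))
                            break;
                    if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
                            puts("syscall entry or exit stop");
            }
            return 0;
    }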
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe710..f6a11b9b1f98 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
65 ICH_FORCE_HPET_RESUME, 65 ICH_FORCE_HPET_RESUME,
66 VT8237_FORCE_HPET_RESUME, 66 VT8237_FORCE_HPET_RESUME,
67 NVIDIA_FORCE_HPET_RESUME, 67 NVIDIA_FORCE_HPET_RESUME,
68 ATI_FORCE_HPET_RESUME,
68} force_hpet_resume_type; 69} force_hpet_resume_type;
69 70
70static void __iomem *rcba_base; 71static void __iomem *rcba_base;
@@ -158,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
158 159
159DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, 160DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
160 ich_force_enable_hpet); 161 ich_force_enable_hpet);
162DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
163 ich_force_enable_hpet);
161DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, 164DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
162 ich_force_enable_hpet); 165 ich_force_enable_hpet);
163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, 166DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
@@ -174,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
174 177
175static struct pci_dev *cached_dev; 178static struct pci_dev *cached_dev;
176 179
180static void hpet_print_force_info(void)
181{
182 printk(KERN_INFO "HPET not enabled in BIOS. "
183 "You might try hpet=force boot option\n");
184}
185
177static void old_ich_force_hpet_resume(void) 186static void old_ich_force_hpet_resume(void)
178{ 187{
179 u32 val; 188 u32 val;
@@ -253,8 +262,12 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
253{ 262{
254 if (hpet_force_user) 263 if (hpet_force_user)
255 old_ich_force_enable_hpet(dev); 264 old_ich_force_enable_hpet(dev);
265 else
266 hpet_print_force_info();
256} 267}
257 268
269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
270 old_ich_force_enable_hpet_user);
258DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, 271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
259 old_ich_force_enable_hpet_user); 272 old_ich_force_enable_hpet_user);
260DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, 273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
@@ -290,9 +303,14 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
290{ 303{
291 u32 uninitialized_var(val); 304 u32 uninitialized_var(val);
292 305
293 if (!hpet_force_user || hpet_address || force_hpet_address) 306 if (hpet_address || force_hpet_address)
294 return; 307 return;
295 308
309 if (!hpet_force_user) {
310 hpet_print_force_info();
311 return;
312 }
313
296 pci_read_config_dword(dev, 0x68, &val); 314 pci_read_config_dword(dev, 0x68, &val);
297 /* 315 /*
298 * Bit 7 is HPET enable bit. 316 * Bit 7 is HPET enable bit.
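All of the HPET force-enable quirks now share the same gate: bail out if an HPET is already known, and only proceed when the user passed hpet=force, otherwise just print a hint. A condensed sketch of that gate using the existing hpet_force_user flag and the hpet_print_force_info() helper added above (the wrapper name is illustrative; each quirk open-codes this test in the patch):

    static int hpet_force_allowed(void)
    {
            if (hpet_address || force_hpet_address)
                    return 0;               /* HPET already set up, nothing to force */

            if (!hpet_force_user) {
                    hpet_print_force_info();
                    return 0;               /* user did not pass hpet=force */
            }

            return 1;
    }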
@@ -330,6 +348,73 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
330DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 348DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
331 vt8237_force_enable_hpet); 349 vt8237_force_enable_hpet);
332 350
351static void ati_force_hpet_resume(void)
352{
353 pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
354 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355}
356
357static u32 ati_ixp4x0_rev(struct pci_dev *dev)
358{
359 u32 d;
360 u8 b;
361
362 pci_read_config_byte(dev, 0xac, &b);
363 b &= ~(1<<5);
364 pci_write_config_byte(dev, 0xac, b);
365 pci_read_config_dword(dev, 0x70, &d);
366 d |= 1<<8;
367 pci_write_config_dword(dev, 0x70, d);
368 pci_read_config_dword(dev, 0x8, &d);
369 d &= 0xff;
370 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
371 return d;
372}
373
374static void ati_force_enable_hpet(struct pci_dev *dev)
375{
376 u32 d, val;
377 u8 b;
378
379 if (hpet_address || force_hpet_address)
380 return;
381
382 if (!hpet_force_user) {
383 hpet_print_force_info();
384 return;
385 }
386
387 d = ati_ixp4x0_rev(dev);
388 if (d < 0x82)
389 return;
390
391 /* base address */
392 pci_write_config_dword(dev, 0x14, 0xfed00000);
393 pci_read_config_dword(dev, 0x14, &val);
394
395 /* enable interrupt */
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 b |= 0x1;
398 outb(0x72, 0xcd6); outb(b, 0xcd7);
399 outb(0x72, 0xcd6); b = inb(0xcd7);
400 if (!(b & 0x1))
401 return;
402 pci_read_config_dword(dev, 0x64, &d);
403 d |= (1<<10);
404 pci_write_config_dword(dev, 0x64, d);
405 pci_read_config_dword(dev, 0x64, &d);
406 if (!(d & (1<<10)))
407 return;
408
409 force_hpet_address = val;
410 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
411 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
412 force_hpet_address);
413 cached_dev = dev;
414}
415DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
416 ati_force_enable_hpet);
417
333/* 418/*
334 * Undocumented chipset feature taken from LinuxBIOS. 419 * Undocumented chipset feature taken from LinuxBIOS.
335 */ 420 */
@@ -343,9 +428,14 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
343{ 428{
344 u32 uninitialized_var(val); 429 u32 uninitialized_var(val);
345 430
346 if (!hpet_force_user || hpet_address || force_hpet_address) 431 if (hpet_address || force_hpet_address)
347 return; 432 return;
348 433
434 if (!hpet_force_user) {
435 hpet_print_force_info();
436 return;
437 }
438
349 pci_write_config_dword(dev, 0x44, 0xfed00001); 439 pci_write_config_dword(dev, 0x44, 0xfed00001);
350 pci_read_config_dword(dev, 0x44, &val); 440 pci_read_config_dword(dev, 0x44, &val);
351 force_hpet_address = val & 0xfffffffe; 441 force_hpet_address = val & 0xfffffffe;
@@ -397,6 +487,9 @@ void force_hpet_resume(void)
397 case NVIDIA_FORCE_HPET_RESUME: 487 case NVIDIA_FORCE_HPET_RESUME:
398 nvidia_force_hpet_resume(); 488 nvidia_force_hpet_resume();
399 return; 489 return;
490 case ATI_FORCE_HPET_RESUME:
491 ati_force_hpet_resume();
492 return;
400 default: 493 default:
401 break; 494 break;
402 } 495 }
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f8..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,9 +27,13 @@
27void (*pm_power_off)(void); 27void (*pm_power_off)(void);
28EXPORT_SYMBOL(pm_power_off); 28EXPORT_SYMBOL(pm_power_off);
29 29
30static long no_idt[3]; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -177,6 +181,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 181 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 182 },
179 }, 183 },
184 { /* Handle problems with rebooting on Dell T5400's */
185 .callback = set_bios_reboot,
186 .ident = "Dell Precision T5400",
187 .matches = {
188 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
189 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
190 },
191 },
180 { /* Handle problems with rebooting on HP laptops */ 192 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 193 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 194 .ident = "HP Compaq Laptop",
@@ -201,15 +213,15 @@ core_initcall(reboot_init);
201 controller to pulse the CPU reset line, which is more thorough, but 213 controller to pulse the CPU reset line, which is more thorough, but
202 doesn't work with at least one type of 486 motherboard. It is easy 214 doesn't work with at least one type of 486 motherboard. It is easy
203 to stop this code working; hence the copious comments. */ 215 to stop this code working; hence the copious comments. */
204static unsigned long long 216static const unsigned long long
205real_mode_gdt_entries [3] = 217real_mode_gdt_entries [3] =
206{ 218{
207 0x0000000000000000ULL, /* Null descriptor */ 219 0x0000000000000000ULL, /* Null descriptor */
208 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ 220 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
209 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ 221 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
210}; 222};
211 223
212static struct desc_ptr 224static const struct desc_ptr
213real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, 225real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
214real_mode_idt = { 0x3ff, 0 }; 226real_mode_idt = { 0x3ff, 0 };
215 227
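The real-mode GDT entries switch their access bytes from 0x9a/0x92 to 0x9b/0x93, i.e. the "accessed" bit is pre-set so the CPU never needs to write it back into what is now a const (potentially read-only) table. A small stand-alone decoder for these 8-byte descriptors, using the standard GDT bit layout rather than any kernel code:

    #include <stdio.h>

    static void decode(unsigned long long d)
    {
            unsigned long long base  = ((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24);
            unsigned long long limit = (d & 0xffff) | (((d >> 48) & 0xf) << 16);
            unsigned int access      = (d >> 40) & 0xff;

            printf("base=%#llx limit=%#llx access=%#x (accessed bit %s)\n",
                   base, limit, access, (access & 1) ? "set" : "clear");
    }

    int main(void)
    {
            decode(0x00009b000000ffffULL);  /* 16-bit code, base 0, limit 64K */
            decode(0x000093000100ffffULL);  /* 16-bit data, base 0x100, limit 64K */
            return 0;
    }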
@@ -231,7 +243,7 @@ real_mode_idt = { 0x3ff, 0 };
231 243
232 More could be done here to set up the registers as if a CPU reset had 244 More could be done here to set up the registers as if a CPU reset had
233 occurred; hopefully real BIOSs don't assume much. */ 245 occurred; hopefully real BIOSs don't assume much. */
234static unsigned char real_mode_switch [] = 246static const unsigned char real_mode_switch [] =
235{ 247{
236 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 248 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
237 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ 249 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
@@ -245,7 +257,7 @@ static unsigned char real_mode_switch [] =
245 0x24, 0x10, /* f: andb $0x10,al */ 257 0x24, 0x10, /* f: andb $0x10,al */
246 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ 258 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
247}; 259};
248static unsigned char jump_to_bios [] = 260static const unsigned char jump_to_bios [] =
249{ 261{
250 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ 262 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
251}; 263};
@@ -255,7 +267,7 @@ static unsigned char jump_to_bios [] =
255 * specified by the code and length parameters. 267 * specified by the code and length parameters.
256 * We assume that length will aways be less that 100! 268 * We assume that length will aways be less that 100!
257 */ 269 */
258void machine_real_restart(unsigned char *code, int length) 270void machine_real_restart(const unsigned char *code, int length)
259{ 271{
260 local_irq_disable(); 272 local_irq_disable();
261 273
@@ -368,7 +380,7 @@ static void native_machine_emergency_restart(void)
368 } 380 }
369 381
370 case BOOT_TRIPLE: 382 case BOOT_TRIPLE:
371 load_idt((const struct desc_ptr *)&no_idt); 383 load_idt(&no_idt);
372 __asm__ __volatile__("int3"); 384 __asm__ __volatile__("int3");
373 385
374 reboot_type = BOOT_KBD; 386 reboot_type = BOOT_KBD;
@@ -403,10 +415,9 @@ void native_machine_shutdown(void)
403{ 415{
404 /* Stop the cpus and apics */ 416 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 417#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 418
408 /* The boot cpu is always logical cpu 0 */ 419 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 420 int reboot_cpu_id = 0;
410 421
411#ifdef CONFIG_X86_32 422#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 423 /* See if there has been given a command line override */
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c2..61a837743fe5 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
49 void (*reboot_fixup)(struct pci_dev *); 49 void (*reboot_fixup)(struct pci_dev *);
50}; 50};
51 51
52static struct device_fixup fixups_table[] = { 52static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
64 */ 64 */
65void mach_reboot_fixups(void) 65void mach_reboot_fixups(void)
66{ 66{
67 struct device_fixup *cur; 67 const struct device_fixup *cur;
68 struct pci_dev *dev; 68 struct pci_dev *dev;
69 int i; 69 int i;
70 70
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define ESP DATA(0x0)
31#define CR0 DATA(0x4)
32#define CR3 DATA(0x8)
33#define CR4 DATA(0xc)
34
35/* other data */
36#define CP_VA_CONTROL_PAGE DATA(0x10)
37#define CP_PA_PGD DATA(0x14)
38#define CP_PA_SWAP_PAGE DATA(0x18)
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40
23 .text 41 .text
24 .align PAGE_SIZE 42 .align PAGE_SIZE
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 45 /* Save the CPU context, used for jumping back */
46
47 pushl %ebx
48 pushl %esi
49 pushl %edi
50 pushl %ebp
51 pushf
52
53 movl 20+8(%esp), %ebp /* list of pages */
54 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
55 movl %esp, ESP(%edi)
56 movl %cr0, %eax
57 movl %eax, CR0(%edi)
58 movl %cr3, %eax
59 movl %eax, CR3(%edi)
60 movl %cr4, %eax
61 movl %eax, CR4(%edi)
28 62
29#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 64 /* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
138 172
139relocate_new_kernel: 173relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 174 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 175 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 176 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 177 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 178 movl 20+16(%esp), %ecx /* cpu_has_pae */
179 movl 20+20(%esp), %esi /* preserve_context */
145 180
146 /* zero out flags, and disable interrupts */ 181 /* zero out flags, and disable interrupts */
147 pushl $0 182 pushl $0
148 popfl 183 popfl
149 184
185 /* save some information for jumping back */
186 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
187 movl %edi, CP_VA_CONTROL_PAGE(%edi)
188 movl PTR(PA_PGD)(%ebp), %eax
189 movl %eax, CP_PA_PGD(%edi)
190 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
191 movl %eax, CP_PA_SWAP_PAGE(%edi)
192 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
193
150 /* get physical address of control page now */ 194 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 195 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 196 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
197 xorl %eax, %eax 241 xorl %eax, %eax
198 movl %eax, %cr3 242 movl %eax, %cr3
199 243
244 movl CP_PA_SWAP_PAGE(%edi), %eax
245 pushl %eax
246 pushl %ebx
247 call swap_pages
248 addl $8, %esp
249
250 /* To be certain of avoiding problems with self-modifying code
251 * I need to execute a serializing instruction here.
252 * So I flush the TLB, it's handy, and not processor dependent.
253 */
254 xorl %eax, %eax
255 movl %eax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %esp alone */
259
260 testl %esi, %esi
261 jnz 1f
262 xorl %edi, %edi
263 xorl %eax, %eax
264 xorl %ebx, %ebx
265 xorl %ecx, %ecx
266 xorl %edx, %edx
267 xorl %esi, %esi
268 xorl %ebp, %ebp
269 ret
2701:
271 popl %edx
272 movl CP_PA_SWAP_PAGE(%edi), %esp
273 addl $PAGE_SIZE, %esp
2742:
275 call *%edx
276
277 /* get the re-entry point of the peer system */
278 movl 0(%esp), %ebp
279 call 1f
2801:
281 popl %ebx
282 subl $(1b - relocate_kernel), %ebx
283 movl CP_VA_CONTROL_PAGE(%ebx), %edi
284 lea PAGE_SIZE(%ebx), %esp
285 movl CP_PA_SWAP_PAGE(%ebx), %eax
286 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
287 pushl %eax
288 pushl %edx
289 call swap_pages
290 addl $8, %esp
291 movl CP_PA_PGD(%ebx), %eax
292 movl %eax, %cr3
293 movl %cr0, %eax
294 orl $(1<<31), %eax
295 movl %eax, %cr0
296 lea PAGE_SIZE(%edi), %esp
297 movl %edi, %eax
298 addl $(virtual_mapped - relocate_kernel), %eax
299 pushl %eax
300 ret
301
302virtual_mapped:
303 movl CR4(%edi), %eax
304 movl %eax, %cr4
305 movl CR3(%edi), %eax
306 movl %eax, %cr3
307 movl CR0(%edi), %eax
308 movl %eax, %cr0
309 movl ESP(%edi), %esp
310 movl %ebp, %eax
311
312 popf
313 popl %ebp
314 popl %edi
315 popl %esi
316 popl %ebx
317 ret
318
200 /* Do the copies */ 319 /* Do the copies */
201 movl %ebx, %ecx 320swap_pages:
321 movl 8(%esp), %edx
322 movl 4(%esp), %ecx
323 pushl %ebp
324 pushl %ebx
325 pushl %edi
326 pushl %esi
327 movl %ecx, %ebx
202 jmp 1f 328 jmp 1f
203 329
2040: /* top, read another word from the indirection page */ 3300: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 352 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 353 andl $0xfffff000, %esi
228 354
355 movl %edi, %eax
356 movl %esi, %ebp
357
358 movl %edx, %edi
229 movl $1024, %ecx 359 movl $1024, %ecx
230 rep ; movsl 360 rep ; movsl
231 jmp 0b
232
2333:
234 361
235 /* To be certain of avoiding problems with self-modifying code 362 movl %ebp, %edi
236 * I need to execute a serializing instruction here. 363 movl %eax, %esi
237 * So I flush the TLB, it's handy, and not processor dependent. 364 movl $1024, %ecx
238 */ 365 rep ; movsl
239 xorl %eax, %eax
240 movl %eax, %cr3
241 366
242 /* set all of the registers to known values */ 367 movl %eax, %edi
243 /* leave %esp alone */ 368 movl %edx, %esi
369 movl $1024, %ecx
370 rep ; movsl
244 371
245 xorl %eax, %eax 372 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 373 jmp 0b
247 xorl %ecx, %ecx 3743:
248 xorl %edx, %edx 375 popl %esi
249 xorl %esi, %esi 376 popl %edi
250 xorl %edi, %edi 377 popl %ebx
251 xorl %ebp, %ebp 378 popl %ebp
252 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
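The DATA() offsets added above carve a small save area out of the control page, between the end of the relocation code and PAGE_SIZE, so that a preserve_context kexec (%esi non-zero) can call the new image and later let virtual_mapped restore CR4/CR3/CR0 and the old stack. The compilable sketch below lays out those slots; the KEXEC_CONTROL_CODE_MAX_SIZE value is an assumption for illustration and is not taken from this patch.

#include <stdio.h>

#define KEXEC_CONTROL_CODE_MAX_SIZE 2048	/* assumed code-area size */
#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE + (offset))

/* Minimal CPU state saved before jumping to the new kernel */
#define ESP  DATA(0x0)
#define CR0  DATA(0x4)
#define CR3  DATA(0x8)
#define CR4  DATA(0xc)

/* Addresses needed to find the way back to the original kernel */
#define CP_VA_CONTROL_PAGE     DATA(0x10)
#define CP_PA_PGD              DATA(0x14)
#define CP_PA_SWAP_PAGE        DATA(0x18)
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)

int main(void)
{
	/* everything from the end of the code area up to PAGE_SIZE is
	 * data storage plus the temporary stack used when jumping back */
	printf("save area starts at control_page + %d\n", DATA(0));
	printf("ESP at +%d, CR0 at +%d, CR3 at +%d, CR4 at +%d\n",
	       ESP, CR0, CR3, CR4);
	printf("VA_CONTROL_PAGE at +%d, PA_PGD at +%d, PA_SWAP_PAGE at +%d\n",
	       CP_VA_CONTROL_PAGE, CP_PA_PGD, CP_PA_SWAP_PAGE);
	return 0;
}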
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b8..0a23b5795b25 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
223static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
224{ 224{
225#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices) 226 static const char *ids[] __initconst =
227 platform_device_register(&rtc_device); 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
228#else 228 struct pnp_dev *dev;
229 struct pnp_id *id;
230 int i;
231
232 pnp_for_each_dev(dev) {
233 for (id = dev->id; id; id = id->next) {
234 for (i = 0; i < ARRAY_SIZE(ids); i++) {
235 if (compare_pnp_id(id, ids[i]) != 0)
236 return 0;
237 }
238 }
239 }
240#endif
241
229 platform_device_register(&rtc_device); 242 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */ 243 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n");
231 return 0; 245 return 0;
232} 246}
233device_initcall(add_rtc_cmos); 247device_initcall(add_rtc_cmos);
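The new add_rtc_cmos() above registers the legacy platform CMOS device only when no PNP device advertises one of the RTC ids (PNP0b00/PNP0b01/PNP0b02). Here is a user-space sketch of that id scan over plain strings; the device tables and helper name below are invented and only the matching logic follows the patch.

#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char *rtc_ids[] = { "PNP0b00", "PNP0b01", "PNP0b02" };

/* pretend PNP device table: each device is a NULL-terminated id list */
static const char *dev0_ids[] = { "PNP0c01", NULL };
static const char *dev1_ids[] = { "PNP0b00", NULL };	/* a PNP RTC */
static const char **devices[] = { dev0_ids, dev1_ids };

static int pnp_describes_rtc(void)
{
	size_t d, i;
	const char **id;

	for (d = 0; d < ARRAY_SIZE(devices); d++)
		for (id = devices[d]; *id; id++)
			for (i = 0; i < ARRAY_SIZE(rtc_ids); i++)
				if (strcmp(*id, rtc_ids[i]) == 0)
					return 1;	/* PNP owns the RTC */
	return 0;
}

int main(void)
{
	if (!pnp_describes_rtc())
		puts("registered platform RTC device (no PNP device found)");
	else
		puts("PNP RTC present, skipping platform device");
	return 0;
}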
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..2255782e8d4b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1,139 +1,1099 @@
1#include <linux/kernel.h> 1/*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 *
6 * Memory region support
7 * David Parsons <orc@pell.chi.il.us>, July-August 1999
8 *
9 * Added E820 sanitization routine (removes overlapping memory regions);
10 * Brian Moyle <bmoyle@mvista.com>, February 2001
11 *
12 * Moved CPU detection code to cpu/${cpu}.c
13 * Patrick Mochel <mochel@osdl.org>, March 2002
14 *
15 * Provisions for empty E820 memory regions (reported by certain BIOSes).
16 * Alex Achenbach <xela@slit.de>, December 2002.
17 *
18 */
19
20/*
21 * This file handles the architecture-dependent parts of initialization
22 */
23
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/mmzone.h>
27#include <linux/screen_info.h>
28#include <linux/ioport.h>
29#include <linux/acpi.h>
30#include <linux/apm_bios.h>
31#include <linux/initrd.h>
32#include <linux/bootmem.h>
33#include <linux/seq_file.h>
34#include <linux/console.h>
35#include <linux/mca.h>
36#include <linux/root_dev.h>
37#include <linux/highmem.h>
2#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/efi.h>
3#include <linux/init.h> 40#include <linux/init.h>
4#include <linux/bootmem.h> 41#include <linux/edd.h>
42#include <linux/iscsi_ibft.h>
43#include <linux/nodemask.h>
44#include <linux/kexec.h>
45#include <linux/dmi.h>
46#include <linux/pfn.h>
47#include <linux/pci.h>
48#include <asm/pci-direct.h>
49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
51
52#include <linux/errno.h>
53#include <linux/kernel.h>
54#include <linux/stddef.h>
55#include <linux/unistd.h>
56#include <linux/ptrace.h>
57#include <linux/slab.h>
58#include <linux/user.h>
59#include <linux/delay.h>
60
61#include <linux/kallsyms.h>
62#include <linux/cpufreq.h>
63#include <linux/dma-mapping.h>
64#include <linux/ctype.h>
65#include <linux/uaccess.h>
66
5#include <linux/percpu.h> 67#include <linux/percpu.h>
6#include <asm/smp.h> 68#include <linux/crash_dump.h>
7#include <asm/percpu.h> 69
70#include <video/edid.h>
71
72#include <asm/mtrr.h>
73#include <asm/apic.h>
74#include <asm/e820.h>
75#include <asm/mpspec.h>
76#include <asm/setup.h>
77#include <asm/arch_hooks.h>
78#include <asm/efi.h>
8#include <asm/sections.h> 79#include <asm/sections.h>
80#include <asm/dmi.h>
81#include <asm/io_apic.h>
82#include <asm/ist.h>
83#include <asm/vmi.h>
84#include <setup_arch.h>
85#include <asm/bios_ebda.h>
86#include <asm/cacheflush.h>
9#include <asm/processor.h> 87#include <asm/processor.h>
10#include <asm/setup.h> 88#include <asm/bugs.h>
89
90#include <asm/system.h>
91#include <asm/vsyscall.h>
92#include <asm/smp.h>
93#include <asm/desc.h>
94#include <asm/dma.h>
95#include <asm/iommu.h>
96#include <asm/mmu_context.h>
97#include <asm/proto.h>
98
99#include <mach_apic.h>
100#include <asm/paravirt.h>
101
102#include <asm/percpu.h>
11#include <asm/topology.h> 103#include <asm/topology.h>
12#include <asm/mpspec.h>
13#include <asm/apicdef.h> 104#include <asm/apicdef.h>
105#ifdef CONFIG_X86_64
106#include <asm/numa_64.h>
107#endif
14 108
15#ifdef CONFIG_X86_LOCAL_APIC 109#ifndef ARCH_SETUP
16unsigned int num_processors; 110#define ARCH_SETUP
17unsigned disabled_cpus __cpuinitdata; 111#endif
18/* Processor that is doing the boot up */
19unsigned int boot_cpu_physical_apicid = -1U;
20EXPORT_SYMBOL(boot_cpu_physical_apicid);
21 112
22DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; 113#ifndef CONFIG_DEBUG_BOOT_PARAMS
23EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 114struct boot_params __initdata boot_params;
115#else
116struct boot_params boot_params;
117#endif
24 118
25/* Bitmask of physically existing CPUs */ 119/*
26physid_mask_t phys_cpu_present_map; 120 * Machine setup..
121 */
122static struct resource data_resource = {
123 .name = "Kernel data",
124 .start = 0,
125 .end = 0,
126 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
127};
128
129static struct resource code_resource = {
130 .name = "Kernel code",
131 .start = 0,
132 .end = 0,
133 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
134};
135
136static struct resource bss_resource = {
137 .name = "Kernel bss",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
141};
142
143
144#ifdef CONFIG_X86_32
145/* This value is set up by the early boot code to point to the value
146 immediately after the boot time page tables. It contains a *physical*
147 address, and must not be in the .bss segment! */
148unsigned long init_pg_tables_start __initdata = ~0UL;
149unsigned long init_pg_tables_end __initdata = ~0UL;
150
151static struct resource video_ram_resource = {
152 .name = "Video RAM area",
153 .start = 0xa0000,
154 .end = 0xbffff,
155 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
156};
157
158/* cpu data as detected by the assembly code in head.S */
159struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
160/* common cpu data for all cpus */
161struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
162EXPORT_SYMBOL(boot_cpu_data);
163static void set_mca_bus(int x)
164{
165#ifdef CONFIG_MCA
166 MCA_bus = x;
27#endif 167#endif
168}
169
170unsigned int def_to_bigsmp;
171
172/* for MCA, but anyone else can use it if they want */
173unsigned int machine_id;
174unsigned int machine_submodel_id;
175unsigned int BIOS_revision;
176
177struct apm_info apm_info;
178EXPORT_SYMBOL(apm_info);
179
180#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
181 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
182struct ist_info ist_info;
183EXPORT_SYMBOL(ist_info);
184#else
185struct ist_info ist_info;
186#endif
187
188#else
189struct cpuinfo_x86 boot_cpu_data __read_mostly;
190EXPORT_SYMBOL(boot_cpu_data);
191#endif
192
193
194#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
195unsigned long mmu_cr4_features;
196#else
197unsigned long mmu_cr4_features = X86_CR4_PAE;
198#endif
199
200/* Boot loader ID as an integer, for the benefit of proc_dointvec */
201int bootloader_type;
202
203/*
204 * Early DMI memory
205 */
206int dmi_alloc_index;
207char dmi_alloc_data[DMI_MAX_DATA];
28 208
29#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
30/* 209/*
31 * Copy data used in early init routines from the initial arrays to the 210 * Setup options
32 * per cpu data areas. These arrays then become expendable and the 211 */
33 * *_early_ptr's are zeroed indicating that the static arrays are gone. 212struct screen_info screen_info;
213EXPORT_SYMBOL(screen_info);
214struct edid_info edid_info;
215EXPORT_SYMBOL_GPL(edid_info);
216
217extern int root_mountflags;
218
219unsigned long saved_video_mode;
220
221#define RAMDISK_IMAGE_START_MASK 0x07FF
222#define RAMDISK_PROMPT_FLAG 0x8000
223#define RAMDISK_LOAD_FLAG 0x4000
224
225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
229
230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
231struct edd edd;
232#ifdef CONFIG_EDD_MODULE
233EXPORT_SYMBOL(edd);
234#endif
235/**
236 * copy_edd() - Copy the BIOS EDD information
237 * from boot_params into a safe place.
238 *
34 */ 239 */
35static void __init setup_per_cpu_maps(void) 240static inline void copy_edd(void)
36{ 241{
37 int cpu; 242 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
243 sizeof(edd.mbr_signature));
244 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
245 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
246 edd.edd_info_nr = boot_params.eddbuf_entries;
247}
248#else
249static inline void copy_edd(void)
250{
251}
252#endif
253
254#ifdef CONFIG_BLK_DEV_INITRD
255
256#ifdef CONFIG_X86_32
257
258#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
259static void __init relocate_initrd(void)
260{
261
262 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
263 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
264 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
265 u64 ramdisk_here;
266 unsigned long slop, clen, mapaddr;
267 char *p, *q;
268
269 /* We need to move the initrd down into lowmem */
270 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
271 PAGE_SIZE);
272
273 if (ramdisk_here == -1ULL)
274 panic("Cannot find place for new RAMDISK of size %lld\n",
275 ramdisk_size);
276
277 /* Note: this includes all the lowmem currently occupied by
278 the initrd, we rely on that fact to keep the data intact. */
279 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
280 "NEW RAMDISK");
281 initrd_start = ramdisk_here + PAGE_OFFSET;
282 initrd_end = initrd_start + ramdisk_size;
283 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
284 ramdisk_here, ramdisk_here + ramdisk_size);
285
286 q = (char *)initrd_start;
287
288 /* Copy any lowmem portion of the initrd */
289 if (ramdisk_image < end_of_lowmem) {
290 clen = end_of_lowmem - ramdisk_image;
291 p = (char *)__va(ramdisk_image);
292 memcpy(q, p, clen);
293 q += clen;
294 ramdisk_image += clen;
295 ramdisk_size -= clen;
296 }
38 297
39 for_each_possible_cpu(cpu) { 298 /* Copy the highmem portion of the initrd */
40 per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; 299 while (ramdisk_size) {
41 per_cpu(x86_bios_cpu_apicid, cpu) = 300 slop = ramdisk_image & ~PAGE_MASK;
42 x86_bios_cpu_apicid_init[cpu]; 301 clen = ramdisk_size;
43#ifdef CONFIG_NUMA 302 if (clen > MAX_MAP_CHUNK-slop)
44 per_cpu(x86_cpu_to_node_map, cpu) = 303 clen = MAX_MAP_CHUNK-slop;
45 x86_cpu_to_node_map_init[cpu]; 304 mapaddr = ramdisk_image & PAGE_MASK;
305 p = early_memremap(mapaddr, clen+slop);
306 memcpy(q, p+slop, clen);
307 early_iounmap(p, clen+slop);
308 q += clen;
309 ramdisk_image += clen;
310 ramdisk_size -= clen;
311 }
312 /* high pages is not converted by early_res_to_bootmem */
313 ramdisk_image = boot_params.hdr.ramdisk_image;
314 ramdisk_size = boot_params.hdr.ramdisk_size;
315 printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
316 " %08llx - %08llx\n",
317 ramdisk_image, ramdisk_image + ramdisk_size - 1,
318 ramdisk_here, ramdisk_here + ramdisk_size - 1);
319}
46#endif 320#endif
321
322static void __init reserve_initrd(void)
323{
324 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
325 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
326 u64 ramdisk_end = ramdisk_image + ramdisk_size;
327 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
328
329 if (!boot_params.hdr.type_of_loader ||
330 !ramdisk_image || !ramdisk_size)
331 return; /* No initrd provided by bootloader */
332
333 initrd_start = 0;
334
335 if (ramdisk_size >= (end_of_lowmem>>1)) {
336 free_early(ramdisk_image, ramdisk_end);
337 printk(KERN_ERR "initrd too large to handle, "
338 "disabling initrd\n");
339 return;
340 }
341
342 printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
343 ramdisk_end);
344
345
346 if (ramdisk_end <= end_of_lowmem) {
347 /* All in lowmem, easy case */
348 /*
349 * don't need to reserve again, already reserved early
350 * in i386_start_kernel
351 */
352 initrd_start = ramdisk_image + PAGE_OFFSET;
353 initrd_end = initrd_start + ramdisk_size;
354 return;
47 } 355 }
48 356
49 /* indicate the early static arrays will soon be gone */ 357#ifdef CONFIG_X86_32
50 x86_cpu_to_apicid_early_ptr = NULL; 358 relocate_initrd();
51 x86_bios_cpu_apicid_early_ptr = NULL; 359#else
52#ifdef CONFIG_NUMA 360 printk(KERN_ERR "initrd extends beyond end of memory "
53 x86_cpu_to_node_map_early_ptr = NULL; 361 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
362 ramdisk_end, end_of_lowmem);
363 initrd_start = 0;
54#endif 364#endif
365 free_early(ramdisk_image, ramdisk_end);
366}
367#else
368static void __init reserve_initrd(void)
369{
370}
371#endif /* CONFIG_BLK_DEV_INITRD */
372
373static void __init parse_setup_data(void)
374{
375 struct setup_data *data;
376 u64 pa_data;
377
378 if (boot_params.hdr.version < 0x0209)
379 return;
380 pa_data = boot_params.hdr.setup_data;
381 while (pa_data) {
382 data = early_memremap(pa_data, PAGE_SIZE);
383 switch (data->type) {
384 case SETUP_E820_EXT:
385 parse_e820_ext(data, pa_data);
386 break;
387 default:
388 break;
389 }
390 pa_data = data->next;
391 early_iounmap(data, PAGE_SIZE);
392 }
55} 393}
56 394
57#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP 395static void __init e820_reserve_setup_data(void)
58cpumask_t *cpumask_of_cpu_map __read_mostly; 396{
59EXPORT_SYMBOL(cpumask_of_cpu_map); 397 struct setup_data *data;
398 u64 pa_data;
399 int found = 0;
400
401 if (boot_params.hdr.version < 0x0209)
402 return;
403 pa_data = boot_params.hdr.setup_data;
404 while (pa_data) {
405 data = early_memremap(pa_data, sizeof(*data));
406 e820_update_range(pa_data, sizeof(*data)+data->len,
407 E820_RAM, E820_RESERVED_KERN);
408 found = 1;
409 pa_data = data->next;
410 early_iounmap(data, sizeof(*data));
411 }
412 if (!found)
413 return;
414
415 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
416 memcpy(&e820_saved, &e820, sizeof(struct e820map));
417 printk(KERN_INFO "extended physical RAM map:\n");
418 e820_print_map("reserve setup_data");
419}
60 420
61/* requires nr_cpu_ids to be initialized */ 421static void __init reserve_early_setup_data(void)
62static void __init setup_cpumask_of_cpu(void)
63{ 422{
64 int i; 423 struct setup_data *data;
424 u64 pa_data;
425 char buf[32];
426
427 if (boot_params.hdr.version < 0x0209)
428 return;
429 pa_data = boot_params.hdr.setup_data;
430 while (pa_data) {
431 data = early_memremap(pa_data, sizeof(*data));
432 sprintf(buf, "setup data %x", data->type);
433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
434 pa_data = data->next;
435 early_iounmap(data, sizeof(*data));
436 }
437}
438
439/*
440 * --------- Crashkernel reservation ------------------------------
441 */
442
443#ifdef CONFIG_KEXEC
444
445/**
446 * Reserve @size bytes of crashkernel memory at any suitable offset.
447 *
448 * @size: Size of the crashkernel memory to reserve.
449 * Returns the base address on success, and -1ULL on failure.
450 */
451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
452{
453 const unsigned long long alignment = 16<<20; /* 16M */
454 unsigned long long start = 0LL;
455
456 while (1) {
457 int ret;
458
459 start = find_e820_area(start, ULONG_MAX, size, alignment);
460 if (start == -1ULL)
461 return start;
462
463 /* try to reserve it */
464 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
465 if (ret >= 0)
466 return start;
467
468 start += alignment;
469 }
470}
471
472static inline unsigned long long get_total_mem(void)
473{
474 unsigned long long total;
475
476 total = max_low_pfn - min_low_pfn;
477#ifdef CONFIG_HIGHMEM
478 total += highend_pfn - highstart_pfn;
479#endif
65 480
66 /* alloc_bootmem zeroes memory */ 481 return total << PAGE_SHIFT;
67 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); 482}
68 for (i = 0; i < nr_cpu_ids; i++) 483
69 cpu_set(i, cpumask_of_cpu_map[i]); 484static void __init reserve_crashkernel(void)
485{
486 unsigned long long total_mem;
487 unsigned long long crash_size, crash_base;
488 int ret;
489
490 total_mem = get_total_mem();
491
492 ret = parse_crashkernel(boot_command_line, total_mem,
493 &crash_size, &crash_base);
494 if (ret != 0 || crash_size <= 0)
495 return;
496
497 /* 0 means: find the address automatically */
498 if (crash_base <= 0) {
499 crash_base = find_and_reserve_crashkernel(crash_size);
500 if (crash_base == -1ULL) {
501 pr_info("crashkernel reservation failed. "
502 "No suitable area found.\n");
503 return;
504 }
505 } else {
506 ret = reserve_bootmem_generic(crash_base, crash_size,
507 BOOTMEM_EXCLUSIVE);
508 if (ret < 0) {
509 pr_info("crashkernel reservation failed - "
510 "memory is in use\n");
511 return;
512 }
513 }
514
515 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
516 "for crashkernel (System RAM: %ldMB)\n",
517 (unsigned long)(crash_size >> 20),
518 (unsigned long)(crash_base >> 20),
519 (unsigned long)(total_mem >> 20));
520
521 crashk_res.start = crash_base;
522 crashk_res.end = crash_base + crash_size - 1;
523 insert_resource(&iomem_resource, &crashk_res);
70} 524}
71#else 525#else
72static inline void setup_cpumask_of_cpu(void) { } 526static void __init reserve_crashkernel(void)
527{
528}
73#endif 529#endif
74 530
75#ifdef CONFIG_X86_32 531static struct resource standard_io_resources[] = {
532 { .name = "dma1", .start = 0x00, .end = 0x1f,
533 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
534 { .name = "pic1", .start = 0x20, .end = 0x21,
535 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
536 { .name = "timer0", .start = 0x40, .end = 0x43,
537 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
538 { .name = "timer1", .start = 0x50, .end = 0x53,
539 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
540 { .name = "keyboard", .start = 0x60, .end = 0x60,
541 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
542 { .name = "keyboard", .start = 0x64, .end = 0x64,
543 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
544 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
545 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
546 { .name = "pic2", .start = 0xa0, .end = 0xa1,
547 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
548 { .name = "dma2", .start = 0xc0, .end = 0xdf,
549 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
550 { .name = "fpu", .start = 0xf0, .end = 0xff,
551 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
552};
553
554static void __init reserve_standard_io_resources(void)
555{
556 int i;
557
558 /* request I/O space for devices used on all i[345]86 PCs */
559 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
560 request_resource(&ioport_resource, &standard_io_resources[i]);
561
562}
563
564#ifdef CONFIG_PROC_VMCORE
565/* elfcorehdr= specifies the location of elf core header
566 * stored by the crashed kernel. This option will be passed
567 * by kexec loader to the capture kernel.
568 */
569static int __init setup_elfcorehdr(char *arg)
570{
571 char *end;
572 if (!arg)
573 return -EINVAL;
574 elfcorehdr_addr = memparse(arg, &end);
575 return end > arg ? 0 : -EINVAL;
576}
577early_param("elfcorehdr", setup_elfcorehdr);
578#endif
579
580static struct x86_quirks default_x86_quirks __initdata;
581
582struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
583
76/* 584/*
77 * Great future not-so-futuristic plan: make i386 and x86_64 do it 585 * Some BIOSes seem to corrupt the low 64k of memory during events
78 * the same way 586 * like suspend/resume and unplugging an HDMI cable. Reserve all
587 * remaining free memory in that area and fill it with a distinct
588 * pattern.
79 */ 589 */
80unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 590#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
81EXPORT_SYMBOL(__per_cpu_offset); 591#define MAX_SCAN_AREAS 8
592
593static int __read_mostly memory_corruption_check = -1;
594
595static unsigned __read_mostly corruption_check_size = 64*1024;
596static unsigned __read_mostly corruption_check_period = 60; /* seconds */
597
598static struct e820entry scan_areas[MAX_SCAN_AREAS];
599static int num_scan_areas;
600
601
602static int set_corruption_check(char *arg)
603{
604 char *end;
605
606 memory_corruption_check = simple_strtol(arg, &end, 10);
607
608 return (*end == 0) ? 0 : -EINVAL;
609}
610early_param("memory_corruption_check", set_corruption_check);
611
612static int set_corruption_check_period(char *arg)
613{
614 char *end;
615
616 corruption_check_period = simple_strtoul(arg, &end, 10);
617
618 return (*end == 0) ? 0 : -EINVAL;
619}
620early_param("memory_corruption_check_period", set_corruption_check_period);
621
622static int set_corruption_check_size(char *arg)
623{
624 char *end;
625 unsigned size;
626
627 size = memparse(arg, &end);
628
629 if (*end == '\0')
630 corruption_check_size = size;
631
632 return (size == corruption_check_size) ? 0 : -EINVAL;
633}
634early_param("memory_corruption_check_size", set_corruption_check_size);
635
636
637static void __init setup_bios_corruption_check(void)
638{
639 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
640
641 if (memory_corruption_check == -1) {
642 memory_corruption_check =
643#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
644 1
645#else
646 0
647#endif
648 ;
649 }
650
651 if (corruption_check_size == 0)
652 memory_corruption_check = 0;
653
654 if (!memory_corruption_check)
655 return;
656
657 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
658
659 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
660 u64 size;
661 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
662
663 if (addr == 0)
664 break;
665
666 if ((addr + size) > corruption_check_size)
667 size = corruption_check_size - addr;
668
669 if (size == 0)
670 break;
671
672 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
673 scan_areas[num_scan_areas].addr = addr;
674 scan_areas[num_scan_areas].size = size;
675 num_scan_areas++;
676
677 /* Assume we've already mapped this early memory */
678 memset(__va(addr), 0, size);
679
680 addr += size;
681 }
682
683 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
684 num_scan_areas);
685 update_e820();
686}
687
688static struct timer_list periodic_check_timer;
689
690void check_for_bios_corruption(void)
691{
692 int i;
693 int corruption = 0;
694
695 if (!memory_corruption_check)
696 return;
697
698 for(i = 0; i < num_scan_areas; i++) {
699 unsigned long *addr = __va(scan_areas[i].addr);
700 unsigned long size = scan_areas[i].size;
701
702 for(; size; addr++, size -= sizeof(unsigned long)) {
703 if (!*addr)
704 continue;
705 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
706 addr, __pa(addr), *addr);
707 corruption = 1;
708 *addr = 0;
709 }
710 }
711
712 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
713}
714
715static void periodic_check_for_corruption(unsigned long data)
716{
717 check_for_bios_corruption();
718 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
719}
720
721void start_periodic_check_for_corruption(void)
722{
723 if (!memory_corruption_check || corruption_check_period == 0)
724 return;
725
726 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
727 corruption_check_period);
728
729 init_timer(&periodic_check_timer);
730 periodic_check_timer.function = &periodic_check_for_corruption;
731 periodic_check_for_corruption(0);
732}
82#endif 733#endif
83 734
735static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
736{
737 printk(KERN_NOTICE
738 "%s detected: BIOS may corrupt low RAM, working it around.\n",
739 d->ident);
740
741 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
742 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
743
744 return 0;
745}
746
747/* List of systems that have known low memory corruption BIOS problems */
748static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
749#ifdef CONFIG_X86_RESERVE_LOW_64K
750 {
751 .callback = dmi_low_memory_corruption,
752 .ident = "AMI BIOS",
753 .matches = {
754 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
755 },
756 },
757 {
758 .callback = dmi_low_memory_corruption,
759 .ident = "Phoenix BIOS",
760 .matches = {
761 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
762 },
763 },
764#endif
765 {}
766};
767
84/* 768/*
85 * Great future plan: 769 * Determine if we were loaded by an EFI loader. If so, then we have also been
86 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 770 * passed the efi memmap, systab, etc., so we should use these data structures
87 * Always point %gs to its beginning 771 * for initialization. Note, the efi init code path is determined by the
772 * global efi_enabled. This allows the same kernel image to be used on existing
773 * systems (with a traditional BIOS) as well as on EFI systems.
88 */ 774 */
89void __init setup_per_cpu_areas(void) 775/*
776 * setup_arch - architecture-specific boot-time initializations
777 *
778 * Note: On x86_64, fixmaps are ready for use even before this is called.
779 */
780
781void __init setup_arch(char **cmdline_p)
90{ 782{
91 int i, highest_cpu = 0; 783#ifdef CONFIG_X86_32
92 unsigned long size; 784 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
785 visws_early_detect();
786 pre_setup_arch_hook();
787#else
788 printk(KERN_INFO "Command line: %s\n", boot_command_line);
789#endif
93 790
94#ifdef CONFIG_HOTPLUG_CPU 791 early_cpu_init();
95 prefill_possible_map(); 792 early_ioremap_init();
793
794 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
795 screen_info = boot_params.screen_info;
796 edid_info = boot_params.edid_info;
797#ifdef CONFIG_X86_32
798 apm_info.bios = boot_params.apm_bios_info;
799 ist_info = boot_params.ist_info;
800 if (boot_params.sys_desc_table.length != 0) {
801 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
802 machine_id = boot_params.sys_desc_table.table[0];
803 machine_submodel_id = boot_params.sys_desc_table.table[1];
804 BIOS_revision = boot_params.sys_desc_table.table[2];
805 }
96#endif 806#endif
807 saved_video_mode = boot_params.hdr.vid_mode;
808 bootloader_type = boot_params.hdr.type_of_loader;
97 809
98 /* Copy section for each CPU (we discard the original) */ 810#ifdef CONFIG_BLK_DEV_RAM
99 size = PERCPU_ENOUGH_ROOM; 811 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
100 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", 812 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
101 size); 813 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
814#endif
815#ifdef CONFIG_EFI
816 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
817#ifdef CONFIG_X86_32
818 "EL32",
819#else
820 "EL64",
821#endif
822 4)) {
823 efi_enabled = 1;
824 efi_reserve_early();
825 }
826#endif
102 827
103 for_each_possible_cpu(i) { 828 ARCH_SETUP
104 char *ptr; 829
105#ifndef CONFIG_NEED_MULTIPLE_NODES 830 setup_memory_map();
106 ptr = alloc_bootmem_pages(size); 831 parse_setup_data();
832 /* update the e820_saved too */
833 e820_reserve_setup_data();
834
835 copy_edd();
836
837 if (!boot_params.hdr.root_flags)
838 root_mountflags &= ~MS_RDONLY;
839 init_mm.start_code = (unsigned long) _text;
840 init_mm.end_code = (unsigned long) _etext;
841 init_mm.end_data = (unsigned long) _edata;
842#ifdef CONFIG_X86_32
843 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
107#else 844#else
108 int node = early_cpu_to_node(i); 845 init_mm.brk = (unsigned long) &_end;
109 if (!node_online(node) || !NODE_DATA(node)) { 846#endif
110 ptr = alloc_bootmem_pages(size); 847
111 printk(KERN_INFO 848 code_resource.start = virt_to_phys(_text);
112 "cpu %d has no node or node-local memory\n", i); 849 code_resource.end = virt_to_phys(_etext)-1;
113 } 850 data_resource.start = virt_to_phys(_etext);
114 else 851 data_resource.end = virt_to_phys(_edata)-1;
115 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); 852 bss_resource.start = virt_to_phys(&__bss_start);
853 bss_resource.end = virt_to_phys(&__bss_stop)-1;
854
855#ifdef CONFIG_CMDLINE_BOOL
856#ifdef CONFIG_CMDLINE_OVERRIDE
857 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
858#else
859 if (builtin_cmdline[0]) {
860 /* append boot loader cmdline to builtin */
861 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
862 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
863 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
864 }
116#endif 865#endif
117 if (!ptr) 866#endif
118 panic("Cannot allocate cpu data for CPU %d\n", i); 867
868 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
869 *cmdline_p = command_line;
870
871 parse_early_param();
872
119#ifdef CONFIG_X86_64 873#ifdef CONFIG_X86_64
120 cpu_pda(i)->data_offset = ptr - __per_cpu_start; 874 check_efer();
875#endif
876
877#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
878 /*
879 * Must be before kernel pagetables are setup
880 * or fixmap area is touched.
881 */
882 vmi_init();
883#endif
884
885 /* after early param, so could get panic from serial */
886 reserve_early_setup_data();
887
888 if (acpi_mps_check()) {
889#ifdef CONFIG_X86_LOCAL_APIC
890 disable_apic = 1;
891#endif
892 setup_clear_cpu_cap(X86_FEATURE_APIC);
893 }
894
895#ifdef CONFIG_PCI
896 if (pci_early_dump_regs)
897 early_dump_pci_devices();
898#endif
899
900 finish_e820_parsing();
901
902 dmi_scan_machine();
903
904 dmi_check_system(bad_bios_dmi_table);
905
906#ifdef CONFIG_X86_32
907 probe_roms();
908#endif
909
910 /* after parse_early_param, so could debug it */
911 insert_resource(&iomem_resource, &code_resource);
912 insert_resource(&iomem_resource, &data_resource);
913 insert_resource(&iomem_resource, &bss_resource);
914
915 if (efi_enabled)
916 efi_init();
917
918#ifdef CONFIG_X86_32
919 if (ppro_with_ram_bug()) {
920 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
921 E820_RESERVED);
922 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
923 printk(KERN_INFO "fixed physical RAM map:\n");
924 e820_print_map("bad_ppro");
925 }
121#else 926#else
122 __per_cpu_offset[i] = ptr - __per_cpu_start; 927 early_gart_iommu_check();
123#endif 928#endif
124 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
125 929
126 highest_cpu = i; 930 /*
931 * partially used pages are not usable - thus
932 * we are rounding upwards:
933 */
934 max_pfn = e820_end_of_ram_pfn();
935
936 /* preallocate 4k for mptable mpc */
937 early_reserve_e820_mpc_new();
938 /* update e820 for memory not covered by WB MTRRs */
939 mtrr_bp_init();
940 if (mtrr_trim_uncached_memory(max_pfn))
941 max_pfn = e820_end_of_ram_pfn();
942
943#ifdef CONFIG_X86_32
944 /* max_low_pfn get updated here */
945 find_low_pfn_range();
946#else
947 num_physpages = max_pfn;
948
949 if (cpu_has_x2apic)
950 check_x2apic();
951
952 /* How many end-of-memory variables you have, grandma! */
953 /* need this before calling reserve_initrd */
954 if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
955 max_low_pfn = e820_end_of_low_ram_pfn();
956 else
957 max_low_pfn = max_pfn;
958
959 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
960#endif
961
962#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
963 setup_bios_corruption_check();
964#endif
965
966 /* max_pfn_mapped is updated here */
967 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
968 max_pfn_mapped = max_low_pfn_mapped;
969
970#ifdef CONFIG_X86_64
971 if (max_pfn > max_low_pfn) {
972 max_pfn_mapped = init_memory_mapping(1UL<<32,
973 max_pfn<<PAGE_SHIFT);
 974 /* can we preserve max_low_pfn? */
975 max_low_pfn = max_pfn;
127 } 976 }
977#endif
128 978
129 nr_cpu_ids = highest_cpu + 1; 979 /*
130 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); 980 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
981 */
131 982
132 /* Setup percpu data maps */ 983#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
133 setup_per_cpu_maps(); 984 if (init_ohci1394_dma_early)
985 init_ohci1394_dma_on_all_controllers();
986#endif
134 987
135 /* Setup cpumask_of_cpu map */ 988 reserve_initrd();
136 setup_cpumask_of_cpu(); 989
137} 990#ifdef CONFIG_X86_64
991 vsmp_init();
992#endif
993
994 io_delay_init();
995
996 /*
997 * Parse the ACPI tables for possible boot-time SMP configuration.
998 */
999 acpi_boot_table_init();
1000
1001 early_acpi_boot_init();
1002
1003#ifdef CONFIG_ACPI_NUMA
1004 /*
1005 * Parse SRAT to discover nodes.
1006 */
1007 acpi_numa_init();
1008#endif
1009
1010 initmem_init(0, max_pfn);
1011
1012#ifdef CONFIG_ACPI_SLEEP
1013 /*
1014 * Reserve low memory region for sleep support.
1015 */
1016 acpi_reserve_bootmem();
1017#endif
1018#ifdef CONFIG_X86_FIND_SMP_CONFIG
1019 /*
1020 * Find and reserve possible boot-time SMP configuration:
1021 */
1022 find_smp_config();
1023#endif
1024 reserve_crashkernel();
1025
1026#ifdef CONFIG_X86_64
1027 /*
1028 * dma32_reserve_bootmem() allocates bootmem which may conflict
1029 * with the crashkernel command line, so do that after
1030 * reserve_crashkernel()
1031 */
1032 dma32_reserve_bootmem();
1033#endif
1034
1035 reserve_ibft_region();
1036
1037#ifdef CONFIG_KVM_CLOCK
1038 kvmclock_init();
1039#endif
1040
1041 paravirt_pagetable_setup_start(swapper_pg_dir);
1042 paging_init();
1043 paravirt_pagetable_setup_done(swapper_pg_dir);
1044 paravirt_post_allocator_init();
1045
1046#ifdef CONFIG_X86_64
1047 map_vsyscall();
1048#endif
138 1049
1050#ifdef CONFIG_X86_GENERICARCH
1051 generic_apic_probe();
139#endif 1052#endif
1053
1054 early_quirks();
1055
1056 /*
1057 * Read APIC and some other early information from ACPI tables.
1058 */
1059 acpi_boot_init();
1060
1061#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
1062 /*
1063 * get boot-time SMP configuration:
1064 */
1065 if (smp_found_config)
1066 get_smp_config();
1067#endif
1068
1069 prefill_possible_map();
1070#ifdef CONFIG_X86_64
1071 init_cpu_to_node();
1072#endif
1073
1074 init_apic_mappings();
1075 ioapic_init_mappings();
1076
1077 kvm_guest_init();
1078
1079 e820_reserve_resources();
1080 e820_mark_nosave_regions(max_low_pfn);
1081
1082#ifdef CONFIG_X86_32
1083 request_resource(&iomem_resource, &video_ram_resource);
1084#endif
1085 reserve_standard_io_resources();
1086
1087 e820_setup_gap();
1088
1089#ifdef CONFIG_VT
1090#if defined(CONFIG_VGA_CONSOLE)
1091 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
1092 conswitchp = &vga_con;
1093#elif defined(CONFIG_DUMMY_CONSOLE)
1094 conswitchp = &dummy_con;
1095#endif
1096#endif
1097}
1098
1099
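Among the code consolidated into the unified setup.c above is find_and_reserve_crashkernel(), which looks for a 16 MB-aligned range of the requested size in free memory and, whenever a reservation attempt fails, simply retries at the next aligned slot. The runnable sketch below follows only that search strategy; the free-memory table and the stub standing in for reserve_bootmem_generic() are invented test data.

#include <stdio.h>
#include <stdint.h>

#define ALIGNMENT (16ULL << 20)	/* 16 MB, as in the patch */

struct region { uint64_t start, end; };

/* pretend "free RAM" map */
static const struct region free_ram[] = {
	{ 0x00100000ULL, 0x00800000ULL },	/* too small for the request */
	{ 0x04000000ULL, 0x20000000ULL },
};

static int reserve_ok(uint64_t base)
{
	/* stand-in for the bootmem reservation: pretend the first aligned
	 * slot of the big region is already in use, forcing one retry */
	return base != 0x04000000ULL;
}

static uint64_t find_crashkernel_base(uint64_t size)
{
	size_t i;

	for (i = 0; i < sizeof(free_ram) / sizeof(free_ram[0]); i++) {
		/* round the candidate base up to the 16 MB boundary */
		uint64_t base = (free_ram[i].start + ALIGNMENT - 1) & ~(ALIGNMENT - 1);

		while (base + size <= free_ram[i].end) {
			if (reserve_ok(base))
				return base;
			base += ALIGNMENT;	/* reservation failed, try next slot */
		}
	}
	return (uint64_t)-1;	/* -1ULL: no suitable area found */
}

int main(void)
{
	uint64_t base = find_crashkernel_base(64ULL << 20);

	if (base == (uint64_t)-1)
		puts("crashkernel reservation failed, no suitable area found");
	else
		printf("crashkernel reserved at base %#llx\n",
		       (unsigned long long)base);
	return 0;
}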
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
deleted file mode 100644
index aee0e8200777..000000000000
--- a/arch/x86/kernel/setup64.c
+++ /dev/null
@@ -1,287 +0,0 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/bootmem.h>
12#include <linux/bitops.h>
13#include <linux/module.h>
14#include <linux/kgdb.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/proto.h>
25#include <asm/sections.h>
26#include <asm/setup.h>
27#include <asm/genapic.h>
28
29#ifndef CONFIG_DEBUG_BOOT_PARAMS
30struct boot_params __initdata boot_params;
31#else
32struct boot_params boot_params;
33#endif
34
35cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
36
37struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
38EXPORT_SYMBOL(_cpu_pda);
39struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
40
41struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
42
43char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
44
45unsigned long __supported_pte_mask __read_mostly = ~0UL;
46EXPORT_SYMBOL_GPL(__supported_pte_mask);
47
48static int do_not_nx __cpuinitdata = 0;
49
50/* noexec=on|off
51Control non executable mappings for 64bit processes.
52
53on Enable(default)
54off Disable
55*/
56static int __init nonx_setup(char *str)
57{
58 if (!str)
59 return -EINVAL;
60 if (!strncmp(str, "on", 2)) {
61 __supported_pte_mask |= _PAGE_NX;
62 do_not_nx = 0;
63 } else if (!strncmp(str, "off", 3)) {
64 do_not_nx = 1;
65 __supported_pte_mask &= ~_PAGE_NX;
66 }
67 return 0;
68}
69early_param("noexec", nonx_setup);
70
71int force_personality32 = 0;
72
73/* noexec32=on|off
74Control non executable heap for 32bit processes.
75To control the stack too use noexec=off
76
77on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
78off PROT_READ implies PROT_EXEC
79*/
80static int __init nonx32_setup(char *str)
81{
82 if (!strcmp(str, "on"))
83 force_personality32 &= ~READ_IMPLIES_EXEC;
84 else if (!strcmp(str, "off"))
85 force_personality32 |= READ_IMPLIES_EXEC;
86 return 1;
87}
88__setup("noexec32=", nonx32_setup);
89
90void pda_init(int cpu)
91{
92 struct x8664_pda *pda = cpu_pda(cpu);
93
94 /* Setup up data that may be needed in __get_free_pages early */
95 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
96 /* Memory clobbers used to order PDA accessed */
97 mb();
98 wrmsrl(MSR_GS_BASE, pda);
99 mb();
100
101 pda->cpunumber = cpu;
102 pda->irqcount = -1;
103 pda->kernelstack =
104 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
105 pda->active_mm = &init_mm;
106 pda->mmu_state = 0;
107
108 if (cpu == 0) {
109 /* others are initialized in smpboot.c */
110 pda->pcurrent = &init_task;
111 pda->irqstackptr = boot_cpu_stack;
112 } else {
113 pda->irqstackptr = (char *)
114 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
115 if (!pda->irqstackptr)
116 panic("cannot allocate irqstack for cpu %d", cpu);
117 }
118
119
120 pda->irqstackptr += IRQSTACKSIZE-64;
121}
122
123char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
124__attribute__((section(".bss.page_aligned")));
125
126extern asmlinkage void ignore_sysret(void);
127
128/* May not be marked __init: used by software suspend */
129void syscall_init(void)
130{
131 /*
132 * LSTAR and STAR live in a bit strange symbiosis.
133 * They both write to the same internal register. STAR allows to set CS/DS
134 * but only a 32bit target. LSTAR sets the 64bit rip.
135 */
136 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
137 wrmsrl(MSR_LSTAR, system_call);
138 wrmsrl(MSR_CSTAR, ignore_sysret);
139
140#ifdef CONFIG_IA32_EMULATION
141 syscall32_cpu_init ();
142#endif
143
144 /* Flags to clear on syscall */
145 wrmsrl(MSR_SYSCALL_MASK,
146 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
147}
148
149void __cpuinit check_efer(void)
150{
151 unsigned long efer;
152
153 rdmsrl(MSR_EFER, efer);
154 if (!(efer & EFER_NX) || do_not_nx) {
155 __supported_pte_mask &= ~_PAGE_NX;
156 }
157}
158
159unsigned long kernel_eflags;
160
161/*
162 * Copies of the original ist values from the tss are only accessed during
163 * debugging, no special alignment required.
164 */
165DEFINE_PER_CPU(struct orig_ist, orig_ist);
166
167/*
168 * cpu_init() initializes state that is per-CPU. Some data is already
169 * initialized (naturally) in the bootstrap process, such as the GDT
170 * and IDT. We reload them nevertheless, this function acts as a
171 * 'CPU state barrier', nothing should get across.
172 * A lot of state is already set up in PDA init.
173 */
174void __cpuinit cpu_init (void)
175{
176 int cpu = stack_smp_processor_id();
177 struct tss_struct *t = &per_cpu(init_tss, cpu);
178 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
179 unsigned long v;
180 char *estacks = NULL;
181 struct task_struct *me;
182 int i;
183
184 /* CPU 0 is initialised in head64.c */
185 if (cpu != 0) {
186 pda_init(cpu);
187 } else
188 estacks = boot_exception_stacks;
189
190 me = current;
191
192 if (cpu_test_and_set(cpu, cpu_initialized))
193 panic("CPU#%d already initialized!\n", cpu);
194
195 printk("Initializing CPU#%d\n", cpu);
196
197 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
198
199 /*
200 * Initialize the per-CPU GDT with the boot GDT,
201 * and set up the GDT descriptor:
202 */
203 if (cpu)
204 memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
205
206 cpu_gdt_descr[cpu].size = GDT_SIZE;
207 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
208 load_idt((const struct desc_ptr *)&idt_descr);
209
210 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
211 syscall_init();
212
213 wrmsrl(MSR_FS_BASE, 0);
214 wrmsrl(MSR_KERNEL_GS_BASE, 0);
215 barrier();
216
217 check_efer();
218
219 /*
220 * set up and load the per-CPU TSS
221 */
222 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
223 static const unsigned int order[N_EXCEPTION_STACKS] = {
224 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
225 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
226 };
227 if (cpu) {
228 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
229 if (!estacks)
230 panic("Cannot allocate exception stack %ld %d\n",
231 v, cpu);
232 }
233 estacks += PAGE_SIZE << order[v];
234 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
235 }
236
237 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
238 /*
239 * <= is required because the CPU will access up to
240 * 8 bits beyond the end of the IO permission bitmap.
241 */
242 for (i = 0; i <= IO_BITMAP_LONGS; i++)
243 t->io_bitmap[i] = ~0UL;
244
245 atomic_inc(&init_mm.mm_count);
246 me->active_mm = &init_mm;
247 if (me->mm)
248 BUG();
249 enter_lazy_tlb(&init_mm, me);
250
251 set_tss_desc(cpu, t);
252 load_TR_desc();
253 load_LDT(&init_mm.context);
254
255#ifdef CONFIG_KGDB
256 /*
257 * If the kgdb is connected no debug regs should be altered. This
258 * is only applicable when KGDB and a KGDB I/O module are built
259 * into the kernel and you are using early debugging with
260 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
261 */
262 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
263 arch_kgdb_ops.correct_hw_break();
264 else {
265#endif
266 /*
267 * Clear all 6 debug registers:
268 */
269
270 set_debugreg(0UL, 0);
271 set_debugreg(0UL, 1);
272 set_debugreg(0UL, 2);
273 set_debugreg(0UL, 3);
274 set_debugreg(0UL, 6);
275 set_debugreg(0UL, 7);
276#ifdef CONFIG_KGDB
277 /* If the kgdb is connected no debug regs should be altered. */
278 }
279#endif
280
281 fpu_init();
282
283 raw_local_save_flags(kernel_eflags);
284
285 if (is_uv_system())
286 uv_cpu_init();
287}
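The deleted setup64.c above (removed here in favor of the unified setup and CPU-init code) carried syscall_init(), which programs STAR, LSTAR, CSTAR and SYSCALL_MASK. The small sketch below shows how the STAR value packs the selector base used by SYSRET to 32-bit user space (bits 63..48) together with the kernel CS used on SYSCALL entry (bits 47..32); the selector values are typical Linux GDT values assumed for the illustration, not taken from this patch.

#include <stdio.h>
#include <stdint.h>

#define __KERNEL_CS  0x10	/* assumed GDT selector */
#define __USER32_CS  0x23	/* assumed GDT selector */

int main(void)
{
	uint64_t star = ((uint64_t)__USER32_CS << 48) |
			((uint64_t)__KERNEL_CS << 32);

	printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
	/* LSTAR would hold the 64-bit entry point (system_call) and
	 * SYSCALL_MASK the RFLAGS bits cleared on entry (TF/DF/IF/IOPL). */
	return 0;
}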
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
deleted file mode 100644
index 5a2f8e063887..000000000000
--- a/arch/x86/kernel/setup_32.c
+++ /dev/null
@@ -1,964 +0,0 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 *
6 * Memory region support
7 * David Parsons <orc@pell.chi.il.us>, July-August 1999
8 *
9 * Added E820 sanitization routine (removes overlapping memory regions);
10 * Brian Moyle <bmoyle@mvista.com>, February 2001
11 *
12 * Moved CPU detection code to cpu/${cpu}.c
13 * Patrick Mochel <mochel@osdl.org>, March 2002
14 *
15 * Provisions for empty E820 memory regions (reported by certain BIOSes).
16 * Alex Achenbach <xela@slit.de>, December 2002.
17 *
18 */
19
20/*
21 * This file handles the architecture-dependent parts of initialization
22 */
23
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/mmzone.h>
27#include <linux/screen_info.h>
28#include <linux/ioport.h>
29#include <linux/acpi.h>
30#include <linux/apm_bios.h>
31#include <linux/initrd.h>
32#include <linux/bootmem.h>
33#include <linux/seq_file.h>
34#include <linux/console.h>
35#include <linux/mca.h>
36#include <linux/root_dev.h>
37#include <linux/highmem.h>
38#include <linux/module.h>
39#include <linux/efi.h>
40#include <linux/init.h>
41#include <linux/edd.h>
42#include <linux/iscsi_ibft.h>
43#include <linux/nodemask.h>
44#include <linux/kexec.h>
45#include <linux/crash_dump.h>
46#include <linux/dmi.h>
47#include <linux/pfn.h>
48#include <linux/pci.h>
49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
51
52#include <video/edid.h>
53
54#include <asm/mtrr.h>
55#include <asm/apic.h>
56#include <asm/e820.h>
57#include <asm/mpspec.h>
58#include <asm/mmzone.h>
59#include <asm/setup.h>
60#include <asm/arch_hooks.h>
61#include <asm/sections.h>
62#include <asm/io_apic.h>
63#include <asm/ist.h>
64#include <asm/io.h>
65#include <asm/vmi.h>
66#include <setup_arch.h>
67#include <asm/bios_ebda.h>
68#include <asm/cacheflush.h>
69#include <asm/processor.h>
70
71/* This value is set up by the early boot code to point to the value
72 immediately after the boot time page tables. It contains a *physical*
73 address, and must not be in the .bss segment! */
74unsigned long init_pg_tables_end __initdata = ~0UL;
75
76/*
77 * Machine setup..
78 */
79static struct resource data_resource = {
80 .name = "Kernel data",
81 .start = 0,
82 .end = 0,
83 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
84};
85
86static struct resource code_resource = {
87 .name = "Kernel code",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
91};
92
93static struct resource bss_resource = {
94 .name = "Kernel bss",
95 .start = 0,
96 .end = 0,
97 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
98};
99
100static struct resource video_ram_resource = {
101 .name = "Video RAM area",
102 .start = 0xa0000,
103 .end = 0xbffff,
104 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
105};
106
107static struct resource standard_io_resources[] = { {
108 .name = "dma1",
109 .start = 0x0000,
110 .end = 0x001f,
111 .flags = IORESOURCE_BUSY | IORESOURCE_IO
112}, {
113 .name = "pic1",
114 .start = 0x0020,
115 .end = 0x0021,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "timer0",
119 .start = 0x0040,
120 .end = 0x0043,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer1",
124 .start = 0x0050,
125 .end = 0x0053,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "keyboard",
129 .start = 0x0060,
130 .end = 0x0060,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0064,
135 .end = 0x0064,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159/* cpu data as detected by the assembly code in head.S */
160struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
161/* common cpu data for all cpus */
162struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
163EXPORT_SYMBOL(boot_cpu_data);
164
165unsigned int def_to_bigsmp;
166
167#ifndef CONFIG_X86_PAE
168unsigned long mmu_cr4_features;
169#else
170unsigned long mmu_cr4_features = X86_CR4_PAE;
171#endif
172
173/* for MCA, but anyone else can use it if they want */
174unsigned int machine_id;
175unsigned int machine_submodel_id;
176unsigned int BIOS_revision;
177
178/* Boot loader ID as an integer, for the benefit of proc_dointvec */
179int bootloader_type;
180
181/* user-defined highmem size */
182static unsigned int highmem_pages = -1;
183
184/*
185 * Setup options
186 */
187struct screen_info screen_info;
188EXPORT_SYMBOL(screen_info);
189struct apm_info apm_info;
190EXPORT_SYMBOL(apm_info);
191struct edid_info edid_info;
192EXPORT_SYMBOL_GPL(edid_info);
193struct ist_info ist_info;
194#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
195 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
196EXPORT_SYMBOL(ist_info);
197#endif
198
199extern void early_cpu_init(void);
200extern int root_mountflags;
201
202unsigned long saved_video_mode;
203
204#define RAMDISK_IMAGE_START_MASK 0x07FF
205#define RAMDISK_PROMPT_FLAG 0x8000
206#define RAMDISK_LOAD_FLAG 0x4000
207
208static char __initdata command_line[COMMAND_LINE_SIZE];
209
210#ifndef CONFIG_DEBUG_BOOT_PARAMS
211struct boot_params __initdata boot_params;
212#else
213struct boot_params boot_params;
214#endif
215
216#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
217struct edd edd;
218#ifdef CONFIG_EDD_MODULE
219EXPORT_SYMBOL(edd);
220#endif
221/**
222 * copy_edd() - Copy the BIOS EDD information
223 * from boot_params into a safe place.
224 *
225 */
226static inline void copy_edd(void)
227{
228 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
229 sizeof(edd.mbr_signature));
230 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
231 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
232 edd.edd_info_nr = boot_params.eddbuf_entries;
233}
234#else
235static inline void copy_edd(void)
236{
237}
238#endif
239
240int __initdata user_defined_memmap;
241
242/*
243 * "mem=nopentium" disables the 4MB page tables.
244 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
245 * to <mem>, overriding the bios size.
246 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
247 * <start> to <start>+<mem>, overriding the bios size.
248 *
249 * HPA tells me bootloaders need to parse mem=, so no new
250 * option should be mem= [also see Documentation/i386/boot.txt]
251 */
252static int __init parse_mem(char *arg)
253{
254 if (!arg)
255 return -EINVAL;
256
257 if (strcmp(arg, "nopentium") == 0) {
258 setup_clear_cpu_cap(X86_FEATURE_PSE);
259 } else {
260 /* If the user specifies memory size, we
261 * limit the BIOS-provided memory map to
262 * that size. exactmap can be used to specify
263 * the exact map. mem=number can be used to
264 * trim the existing memory map.
265 */
266 unsigned long long mem_size;
267
268 mem_size = memparse(arg, &arg);
269 limit_regions(mem_size);
270 user_defined_memmap = 1;
271 }
272 return 0;
273}
274early_param("mem", parse_mem);
275
276#ifdef CONFIG_PROC_VMCORE
277/* elfcorehdr= specifies the location of elf core header
278 * stored by the crashed kernel.
279 */
280static int __init parse_elfcorehdr(char *arg)
281{
282 if (!arg)
283 return -EINVAL;
284
285 elfcorehdr_addr = memparse(arg, &arg);
286 return 0;
287}
288early_param("elfcorehdr", parse_elfcorehdr);
289#endif /* CONFIG_PROC_VMCORE */
290
291/*
292 * highmem=size forces highmem to be exactly 'size' bytes.
293 * This works even on boxes that have no highmem otherwise.
294 * This also works to reduce highmem size on bigger boxes.
295 */
296static int __init parse_highmem(char *arg)
297{
298 if (!arg)
299 return -EINVAL;
300
301 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
302 return 0;
303}
304early_param("highmem", parse_highmem);
305
306/*
307 * vmalloc=size forces the vmalloc area to be exactly 'size'
308 * bytes. This can be used to increase (or decrease) the
309 * vmalloc area - the default is 128m.
310 */
311static int __init parse_vmalloc(char *arg)
312{
313 if (!arg)
314 return -EINVAL;
315
316 __VMALLOC_RESERVE = memparse(arg, &arg);
317 return 0;
318}
319early_param("vmalloc", parse_vmalloc);
320
321/*
322 * reservetop=size reserves a hole at the top of the kernel address space which
323 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
324 * so relocating the fixmap can be done before paging initialization.
325 */
326static int __init parse_reservetop(char *arg)
327{
328 unsigned long address;
329
330 if (!arg)
331 return -EINVAL;
332
333 address = memparse(arg, &arg);
334 reserve_top_address(address);
335 return 0;
336}
337early_param("reservetop", parse_reservetop);
338
339/*
340 * Determine low and high memory ranges:
341 */
342unsigned long __init find_max_low_pfn(void)
343{
344 unsigned long max_low_pfn;
345
346 max_low_pfn = max_pfn;
347 if (max_low_pfn > MAXMEM_PFN) {
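		/* highmem_pages == -1 means no highmem= override: use everything above MAXMEM as highmem */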
348 if (highmem_pages == -1)
349 highmem_pages = max_pfn - MAXMEM_PFN;
350 if (highmem_pages + MAXMEM_PFN < max_pfn)
351 max_pfn = MAXMEM_PFN + highmem_pages;
352 if (highmem_pages + MAXMEM_PFN > max_pfn) {
353			printk(KERN_WARNING "only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
354 highmem_pages = 0;
355 }
356 max_low_pfn = MAXMEM_PFN;
357#ifndef CONFIG_HIGHMEM
358 /* Maximum memory usable is what is directly addressable */
359		printk(KERN_WARNING "Warning: only %ldMB will be used.\n",
360 MAXMEM>>20);
361 if (max_pfn > MAX_NONPAE_PFN)
362 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
363 else
364 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
365 max_pfn = MAXMEM_PFN;
366#else /* !CONFIG_HIGHMEM */
367#ifndef CONFIG_HIGHMEM64G
368 if (max_pfn > MAX_NONPAE_PFN) {
369 max_pfn = MAX_NONPAE_PFN;
370			printk(KERN_WARNING "Warning: only 4GB will be used.\n");
371 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
372 }
373#endif /* !CONFIG_HIGHMEM64G */
374#endif /* !CONFIG_HIGHMEM */
375 } else {
376 if (highmem_pages == -1)
377 highmem_pages = 0;
378#ifdef CONFIG_HIGHMEM
379 if (highmem_pages >= max_pfn) {
380 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
381 highmem_pages = 0;
382 }
383 if (highmem_pages) {
384			if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
385 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
386 highmem_pages = 0;
387 }
388 max_low_pfn -= highmem_pages;
389 }
390#else
391 if (highmem_pages)
392 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
393#endif
394 }
395 return max_low_pfn;
396}
397
398#define BIOS_LOWMEM_KILOBYTES 0x413
399
400/*
401 * The BIOS places the EBDA/XBDA at the top of conventional
402 * memory, and usually decreases the reported amount of
403 * conventional memory (int 0x12) too. This also contains a
404 * workaround for Dell systems that neglect to reserve EBDA.
405 * The same workaround also avoids a problem with the AMD768MPX
406 * chipset: reserve a page before VGA to prevent PCI prefetch
407 * into it (erratum #56). Usually the page is reserved anyway,
408 * unless you have no PS/2 mouse plugged in.
409 */
410static void __init reserve_ebda_region(void)
411{
412 unsigned int lowmem, ebda_addr;
413
414	/*
415	 * To determine the position of the EBDA and the end of conventional
416	 * memory, we need to look at the BIOS data area. In a paravirtual
417	 * environment that area is absent. We'll just assume that the
418	 * paravirt case can handle memory setup correctly, without our help.
419	 */
420 if (paravirt_enabled())
421 return;
422
423 /* end of low (conventional) memory */
424 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
425 lowmem <<= 10;
426
427 /* start of EBDA area */
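	/* (the BIOS keeps the EBDA segment at 0x40e; get_bios_ebda() turns it into a linear address) */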
428 ebda_addr = get_bios_ebda();
429
430 /* Fixup: bios puts an EBDA in the top 64K segment */
431 /* of conventional memory, but does not adjust lowmem. */
432 if ((lowmem - ebda_addr) <= 0x10000)
433 lowmem = ebda_addr;
434
435 /* Fixup: bios does not report an EBDA at all. */
436 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
437 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
438 lowmem = 0x9f000;
439
440 /* Paranoia: should never happen, but... */
441 if ((lowmem == 0) || (lowmem >= 0x100000))
442 lowmem = 0x9f000;
443
444 /* reserve all memory between lowmem and the 1MB mark */
445 reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
446}
447
448#ifndef CONFIG_NEED_MULTIPLE_NODES
449static void __init setup_bootmem_allocator(void);
450static unsigned long __init setup_memory(void)
451{
452 /*
453 * partially used pages are not usable - thus
454 * we are rounding upwards:
455 */
456 min_low_pfn = PFN_UP(init_pg_tables_end);
457
458 max_low_pfn = find_max_low_pfn();
459
460#ifdef CONFIG_HIGHMEM
461 highstart_pfn = highend_pfn = max_pfn;
462 if (max_pfn > max_low_pfn) {
463 highstart_pfn = max_low_pfn;
464 }
465 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
466 pages_to_mb(highend_pfn - highstart_pfn));
467 num_physpages = highend_pfn;
468 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
469#else
470 num_physpages = max_low_pfn;
471 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
472#endif
473#ifdef CONFIG_FLATMEM
474 max_mapnr = num_physpages;
475#endif
476 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
477 pages_to_mb(max_low_pfn));
478
479 setup_bootmem_allocator();
480
481 return max_low_pfn;
482}
483
484static void __init zone_sizes_init(void)
485{
486 unsigned long max_zone_pfns[MAX_NR_ZONES];
487 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
488 max_zone_pfns[ZONE_DMA] =
489 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
490 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
491#ifdef CONFIG_HIGHMEM
492 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
493 add_active_range(0, 0, highend_pfn);
494#else
495 add_active_range(0, 0, max_low_pfn);
496#endif
497
498 free_area_init_nodes(max_zone_pfns);
499}
500#else
501extern unsigned long __init setup_memory(void);
502extern void zone_sizes_init(void);
503#endif /* !CONFIG_NEED_MULTIPLE_NODES */
504
505static inline unsigned long long get_total_mem(void)
506{
507 unsigned long long total;
508
509 total = max_low_pfn - min_low_pfn;
510#ifdef CONFIG_HIGHMEM
511 total += highend_pfn - highstart_pfn;
512#endif
513
514 return total << PAGE_SHIFT;
515}
516
517#ifdef CONFIG_KEXEC
518static void __init reserve_crashkernel(void)
519{
520 unsigned long long total_mem;
521 unsigned long long crash_size, crash_base;
522 int ret;
523
524 total_mem = get_total_mem();
525
526 ret = parse_crashkernel(boot_command_line, total_mem,
527 &crash_size, &crash_base);
528 if (ret == 0 && crash_size > 0) {
529 if (crash_base > 0) {
530 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
531 "for crashkernel (System RAM: %ldMB)\n",
532 (unsigned long)(crash_size >> 20),
533 (unsigned long)(crash_base >> 20),
534 (unsigned long)(total_mem >> 20));
535
536 if (reserve_bootmem(crash_base, crash_size,
537 BOOTMEM_EXCLUSIVE) < 0) {
538 printk(KERN_INFO "crashkernel reservation "
539 "failed - memory is in use\n");
540 return;
541 }
542
543 crashk_res.start = crash_base;
544 crashk_res.end = crash_base + crash_size - 1;
545 } else
546 printk(KERN_INFO "crashkernel reservation failed - "
547 "you have to specify a base address\n");
548 }
549}
550#else
551static inline void __init reserve_crashkernel(void)
552{}
553#endif
554
555#ifdef CONFIG_BLK_DEV_INITRD
556
557static bool do_relocate_initrd = false;
558
559static void __init reserve_initrd(void)
560{
561 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
562 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
563 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
564 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
565 unsigned long ramdisk_here;
566
567 initrd_start = 0;
568
569 if (!boot_params.hdr.type_of_loader ||
570 !ramdisk_image || !ramdisk_size)
571 return; /* No initrd provided by bootloader */
572
573 if (ramdisk_end < ramdisk_image) {
574 printk(KERN_ERR "initrd wraps around end of memory, "
575 "disabling initrd\n");
576 return;
577 }
578 if (ramdisk_size >= end_of_lowmem/2) {
579 printk(KERN_ERR "initrd too large to handle, "
580 "disabling initrd\n");
581 return;
582 }
583 if (ramdisk_end <= end_of_lowmem) {
584 /* All in lowmem, easy case */
585 reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
586 initrd_start = ramdisk_image + PAGE_OFFSET;
587 initrd_end = initrd_start+ramdisk_size;
588 return;
589 }
590
591 /* We need to move the initrd down into lowmem */
592 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
593
594 /* Note: this includes all the lowmem currently occupied by
595	   the initrd; we rely on that fact to keep the data intact. */
596 reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
597 initrd_start = ramdisk_here + PAGE_OFFSET;
598 initrd_end = initrd_start + ramdisk_size;
599
600 do_relocate_initrd = true;
601}
602
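/* MAX_MAP_CHUNK: the largest range a single early_ioremap() can cover (NR_FIX_BTMAPS boot-time fixmap pages) */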
603#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
604
605static void __init relocate_initrd(void)
606{
607 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
608 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
609 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
610 unsigned long ramdisk_here;
611 unsigned long slop, clen, mapaddr;
612 char *p, *q;
613
614 if (!do_relocate_initrd)
615 return;
616
617 ramdisk_here = initrd_start - PAGE_OFFSET;
618
619 q = (char *)initrd_start;
620
621 /* Copy any lowmem portion of the initrd */
622 if (ramdisk_image < end_of_lowmem) {
623 clen = end_of_lowmem - ramdisk_image;
624 p = (char *)__va(ramdisk_image);
625 memcpy(q, p, clen);
626 q += clen;
627 ramdisk_image += clen;
628 ramdisk_size -= clen;
629 }
630
631 /* Copy the highmem portion of the initrd */
632 while (ramdisk_size) {
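		/* copy in chunks: slop is the sub-page offset of the source, and each early_ioremap() covers at most MAX_MAP_CHUNK bytes */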
633 slop = ramdisk_image & ~PAGE_MASK;
634 clen = ramdisk_size;
635 if (clen > MAX_MAP_CHUNK-slop)
636 clen = MAX_MAP_CHUNK-slop;
637 mapaddr = ramdisk_image & PAGE_MASK;
638 p = early_ioremap(mapaddr, clen+slop);
639 memcpy(q, p+slop, clen);
640 early_iounmap(p, clen+slop);
641 q += clen;
642 ramdisk_image += clen;
643 ramdisk_size -= clen;
644 }
645}
646
647#endif /* CONFIG_BLK_DEV_INITRD */
648
649void __init setup_bootmem_allocator(void)
650{
651 unsigned long bootmap_size;
652 /*
653 * Initialize the boot-time allocator (with low memory only):
654 */
655 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
656
657 register_bootmem_low_pages(max_low_pfn);
658
659 /*
660 * Reserve the bootmem bitmap itself as well. We do this in two
661 * steps (first step was init_bootmem()) because this catches
662 * the (very unlikely) case of us accidentally initializing the
663 * bootmem allocator with an invalid RAM area.
664 */
665 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
666 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
667 BOOTMEM_DEFAULT);
668
669 /*
670 * reserve physical page 0 - it's a special BIOS page on many boxes,
671 * enabling clean reboots, SMP operation, laptop functions.
672 */
673 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
674
675 /* reserve EBDA region */
676 reserve_ebda_region();
677
678#ifdef CONFIG_SMP
679 /*
680 * But first pinch a few for the stack/trampoline stuff
681 * FIXME: Don't need the extra page at 4K, but need to fix
682 * trampoline before removing it. (see the GDT stuff)
683 */
684 reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
685#endif
686#ifdef CONFIG_ACPI_SLEEP
687 /*
688 * Reserve low memory region for sleep support.
689 */
690 acpi_reserve_bootmem();
691#endif
692#ifdef CONFIG_X86_FIND_SMP_CONFIG
693 /*
694 * Find and reserve possible boot-time SMP configuration:
695 */
696 find_smp_config();
697#endif
698#ifdef CONFIG_BLK_DEV_INITRD
699 reserve_initrd();
700#endif
701 numa_kva_reserve();
702 reserve_crashkernel();
703
704 reserve_ibft_region();
705}
706
707/*
708 * The node 0 pgdat is initialized before all of these because
709 * it's needed for bootmem. node>0 pgdats have their virtual
710 * space allocated before the pagetables are in place to access
711 * them, so they can't be cleared then.
712 *
713 * This should all compile down to nothing when NUMA is off.
714 */
715static void __init remapped_pgdat_init(void)
716{
717 int nid;
718
719 for_each_online_node(nid) {
720 if (nid != 0)
721 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
722 }
723}
724
725#ifdef CONFIG_MCA
726static void set_mca_bus(int x)
727{
728 MCA_bus = x;
729}
730#else
731static void set_mca_bus(int x) { }
732#endif
733
734/* Overridden in paravirt.c if CONFIG_PARAVIRT */
735char * __init __attribute__((weak)) memory_setup(void)
736{
737 return machine_specific_memory_setup();
738}
739
740#ifdef CONFIG_NUMA
741/*
742 * One golden day, when i386 and x86_64 are fully integrated, this
743 * will not live here.
744 */
745void *x86_cpu_to_node_map_early_ptr;
746int x86_cpu_to_node_map_init[NR_CPUS] = {
747 [0 ... NR_CPUS-1] = NUMA_NO_NODE
748};
749DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
750#endif
751
752/*
753 * Determine if we were loaded by an EFI loader. If so, then we have also been
754 * passed the efi memmap, systab, etc., so we should use these data structures
755 * for initialization. Note, the efi init code path is determined by the
756 * global efi_enabled. This allows the same kernel image to be used on existing
757 * systems (with a traditional BIOS) as well as on EFI systems.
758 */
759void __init setup_arch(char **cmdline_p)
760{
761 unsigned long max_low_pfn;
762
763 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
764 pre_setup_arch_hook();
765 early_cpu_init();
766 early_ioremap_init();
767
768#ifdef CONFIG_EFI
769 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
770 "EL32", 4))
771 efi_enabled = 1;
772#endif
773
774 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
775 screen_info = boot_params.screen_info;
776 edid_info = boot_params.edid_info;
777 apm_info.bios = boot_params.apm_bios_info;
778 ist_info = boot_params.ist_info;
779 saved_video_mode = boot_params.hdr.vid_mode;
780	if (boot_params.sys_desc_table.length != 0) {
781 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
782 machine_id = boot_params.sys_desc_table.table[0];
783 machine_submodel_id = boot_params.sys_desc_table.table[1];
784 BIOS_revision = boot_params.sys_desc_table.table[2];
785 }
786 bootloader_type = boot_params.hdr.type_of_loader;
787
788#ifdef CONFIG_BLK_DEV_RAM
789 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
790 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
791 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
792#endif
793 ARCH_SETUP
794
795 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
796 print_memory_map(memory_setup());
797
798 copy_edd();
799
800 if (!boot_params.hdr.root_flags)
801 root_mountflags &= ~MS_RDONLY;
802 init_mm.start_code = (unsigned long) _text;
803 init_mm.end_code = (unsigned long) _etext;
804 init_mm.end_data = (unsigned long) _edata;
805 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
806
807 code_resource.start = virt_to_phys(_text);
808 code_resource.end = virt_to_phys(_etext)-1;
809 data_resource.start = virt_to_phys(_etext);
810 data_resource.end = virt_to_phys(_edata)-1;
811 bss_resource.start = virt_to_phys(&__bss_start);
812 bss_resource.end = virt_to_phys(&__bss_stop)-1;
813
814 parse_early_param();
815
816 if (user_defined_memmap) {
817 printk(KERN_INFO "user-defined physical RAM map:\n");
818 print_memory_map("user");
819 }
820
821 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
822 *cmdline_p = command_line;
823
824 if (efi_enabled)
825 efi_init();
826
827 /* update e820 for memory not covered by WB MTRRs */
828 propagate_e820_map();
829 mtrr_bp_init();
830 if (mtrr_trim_uncached_memory(max_pfn))
831 propagate_e820_map();
832
833 max_low_pfn = setup_memory();
834
835#ifdef CONFIG_KVM_CLOCK
836 kvmclock_init();
837#endif
838
839#ifdef CONFIG_VMI
840 /*
841 * Must be after max_low_pfn is determined, and before kernel
842 * pagetables are setup.
843 */
844 vmi_init();
845#endif
846 kvm_guest_init();
847
848 /*
849 * NOTE: before this point _nobody_ is allowed to allocate
850 * any memory using the bootmem allocator. Although the
851	 * allocator is now initialised, only the first 8MB of the kernel
852	 * virtual address space has been mapped. All allocations before
853	 * paging_init() completes must use the alloc_bootmem_low_pages()
854	 * variant (which allocates DMA'able memory), and care must be taken
855	 * not to exceed the 8MB limit.
856 */
857
858#ifdef CONFIG_SMP
859 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
860#endif
861 paging_init();
862
863 /*
864 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
865 */
866
867#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
868 if (init_ohci1394_dma_early)
869 init_ohci1394_dma_on_all_controllers();
870#endif
871
872 remapped_pgdat_init();
873 sparse_init();
874 zone_sizes_init();
875
876 /*
877 * NOTE: at this point the bootmem allocator is fully available.
878 */
879
880#ifdef CONFIG_BLK_DEV_INITRD
881 relocate_initrd();
882#endif
883
884 paravirt_post_allocator_init();
885
886 dmi_scan_machine();
887
888 io_delay_init();
889
890#ifdef CONFIG_X86_SMP
891 /*
892	 * Set up to use the early static init tables during kernel startup.
893	 * X86_SMP excludes sub-arches that don't deal well with it.
894 */
895 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
896 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
897#ifdef CONFIG_NUMA
898 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
899#endif
900#endif
901
902#ifdef CONFIG_X86_GENERICARCH
903 generic_apic_probe();
904#endif
905
906#ifdef CONFIG_ACPI
907 /*
908 * Parse the ACPI tables for possible boot-time SMP configuration.
909 */
910 acpi_boot_table_init();
911#endif
912
913 early_quirks();
914
915#ifdef CONFIG_ACPI
916 acpi_boot_init();
917
918#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
919 if (def_to_bigsmp)
920 printk(KERN_WARNING "More than 8 CPUs detected and "
921 "CONFIG_X86_PC cannot handle it.\nUse "
922 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
923#endif
924#endif
925#ifdef CONFIG_X86_LOCAL_APIC
926 if (smp_found_config)
927 get_smp_config();
928#endif
929
930 e820_register_memory();
931 e820_mark_nosave_regions();
932
933#ifdef CONFIG_VT
934#if defined(CONFIG_VGA_CONSOLE)
935 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
936 conswitchp = &vga_con;
937#elif defined(CONFIG_DUMMY_CONSOLE)
938 conswitchp = &dummy_con;
939#endif
940#endif
941}
942
943/*
944 * Request address space for all standard resources
945 *
946 * This is called just before pcibios_init(), which is also a
947 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
948 */
949static int __init request_standard_resources(void)
950{
951 int i;
952
953 printk(KERN_INFO "Setting up standard PCI resources\n");
954 init_iomem_resources(&code_resource, &data_resource, &bss_resource);
955
956 request_resource(&iomem_resource, &video_ram_resource);
957
958 /* request I/O space for devices used on all i[345]86 PCs */
959 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
960 request_resource(&ioport_resource, &standard_io_resources[i]);
961 return 0;
962}
963
964subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
deleted file mode 100644
index 6dff1286ad8a..000000000000
--- a/arch/x86/kernel/setup_64.c
+++ /dev/null
@@ -1,1194 +0,0 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 */
4
5/*
6 * This file handles the architecture-dependent parts of initialization
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/stddef.h>
14#include <linux/unistd.h>
15#include <linux/ptrace.h>
16#include <linux/slab.h>
17#include <linux/user.h>
18#include <linux/screen_info.h>
19#include <linux/ioport.h>
20#include <linux/delay.h>
21#include <linux/init.h>
22#include <linux/initrd.h>
23#include <linux/highmem.h>
24#include <linux/bootmem.h>
25#include <linux/module.h>
26#include <asm/processor.h>
27#include <linux/console.h>
28#include <linux/seq_file.h>
29#include <linux/crash_dump.h>
30#include <linux/root_dev.h>
31#include <linux/pci.h>
32#include <asm/pci-direct.h>
33#include <linux/efi.h>
34#include <linux/acpi.h>
35#include <linux/kallsyms.h>
36#include <linux/edd.h>
37#include <linux/iscsi_ibft.h>
38#include <linux/mmzone.h>
39#include <linux/kexec.h>
40#include <linux/cpufreq.h>
41#include <linux/dmi.h>
42#include <linux/dma-mapping.h>
43#include <linux/ctype.h>
44#include <linux/sort.h>
45#include <linux/uaccess.h>
46#include <linux/init_ohci1394_dma.h>
47#include <linux/kvm_para.h>
48
49#include <asm/mtrr.h>
50#include <asm/uaccess.h>
51#include <asm/system.h>
52#include <asm/vsyscall.h>
53#include <asm/io.h>
54#include <asm/smp.h>
55#include <asm/msr.h>
56#include <asm/desc.h>
57#include <video/edid.h>
58#include <asm/e820.h>
59#include <asm/dma.h>
60#include <asm/gart.h>
61#include <asm/mpspec.h>
62#include <asm/mmu_context.h>
63#include <asm/proto.h>
64#include <asm/setup.h>
65#include <asm/numa.h>
66#include <asm/sections.h>
67#include <asm/dmi.h>
68#include <asm/cacheflush.h>
69#include <asm/mce.h>
70#include <asm/ds.h>
71#include <asm/topology.h>
72#include <asm/trampoline.h>
73#include <asm/pat.h>
74
75#include <mach_apic.h>
76#ifdef CONFIG_PARAVIRT
77#include <asm/paravirt.h>
78#else
79#define ARCH_SETUP
80#endif
81
82/*
83 * Machine setup..
84 */
85
86struct cpuinfo_x86 boot_cpu_data __read_mostly;
87EXPORT_SYMBOL(boot_cpu_data);
88
89__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
90
91unsigned long mmu_cr4_features;
92
93/* Boot loader ID as an integer, for the benefit of proc_dointvec */
94int bootloader_type;
95
96unsigned long saved_video_mode;
97
98int force_mwait __cpuinitdata;
99
100/*
101 * Early DMI memory
102 */
103int dmi_alloc_index;
104char dmi_alloc_data[DMI_MAX_DATA];
105
106/*
107 * Setup options
108 */
109struct screen_info screen_info;
110EXPORT_SYMBOL(screen_info);
111struct sys_desc_table_struct {
112 unsigned short length;
113 unsigned char table[0];
114};
115
116struct edid_info edid_info;
117EXPORT_SYMBOL_GPL(edid_info);
118
119extern int root_mountflags;
120
121char __initdata command_line[COMMAND_LINE_SIZE];
122
123static struct resource standard_io_resources[] = {
124 { .name = "dma1", .start = 0x00, .end = 0x1f,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
126 { .name = "pic1", .start = 0x20, .end = 0x21,
127 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
128 { .name = "timer0", .start = 0x40, .end = 0x43,
129 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
130 { .name = "timer1", .start = 0x50, .end = 0x53,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
132 { .name = "keyboard", .start = 0x60, .end = 0x60,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
134 { .name = "keyboard", .start = 0x64, .end = 0x64,
135 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
136 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
137 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
138 { .name = "pic2", .start = 0xa0, .end = 0xa1,
139 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
140 { .name = "dma2", .start = 0xc0, .end = 0xdf,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
142 { .name = "fpu", .start = 0xf0, .end = 0xff,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
144};
145
146#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
147
148static struct resource data_resource = {
149 .name = "Kernel data",
150 .start = 0,
151 .end = 0,
152 .flags = IORESOURCE_RAM,
153};
154static struct resource code_resource = {
155 .name = "Kernel code",
156 .start = 0,
157 .end = 0,
158 .flags = IORESOURCE_RAM,
159};
160static struct resource bss_resource = {
161 .name = "Kernel bss",
162 .start = 0,
163 .end = 0,
164 .flags = IORESOURCE_RAM,
165};
166
167static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
168
169#ifdef CONFIG_PROC_VMCORE
170/* elfcorehdr= specifies the location of the ELF core header
171 * stored by the crashed kernel. This option is passed by the
172 * kexec loader to the capture kernel.
173 */
174static int __init setup_elfcorehdr(char *arg)
175{
176 char *end;
177 if (!arg)
178 return -EINVAL;
179 elfcorehdr_addr = memparse(arg, &end);
180 return end > arg ? 0 : -EINVAL;
181}
182early_param("elfcorehdr", setup_elfcorehdr);
183#endif
184
185#ifndef CONFIG_NUMA
186static void __init
187contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
188{
189 unsigned long bootmap_size, bootmap;
190
191 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
192 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
193 PAGE_SIZE);
194 if (bootmap == -1L)
195 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
196 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
197 e820_register_active_regions(0, start_pfn, end_pfn);
198 free_bootmem_with_active_regions(0, end_pfn);
199 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
200 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
201}
202#endif
203
204#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
205struct edd edd;
206#ifdef CONFIG_EDD_MODULE
207EXPORT_SYMBOL(edd);
208#endif
209/**
210 * copy_edd() - Copy the BIOS EDD information
211 * from boot_params into a safe place.
212 *
213 */
214static inline void copy_edd(void)
215{
216 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
217 sizeof(edd.mbr_signature));
218 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
219 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
220 edd.edd_info_nr = boot_params.eddbuf_entries;
221}
222#else
223static inline void copy_edd(void)
224{
225}
226#endif
227
228#ifdef CONFIG_KEXEC
229static void __init reserve_crashkernel(void)
230{
231 unsigned long long total_mem;
232 unsigned long long crash_size, crash_base;
233 int ret;
234
235 total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
236
237 ret = parse_crashkernel(boot_command_line, total_mem,
238 &crash_size, &crash_base);
239 if (ret == 0 && crash_size) {
240 if (crash_base <= 0) {
241 printk(KERN_INFO "crashkernel reservation failed - "
242 "you have to specify a base address\n");
243 return;
244 }
245
246 if (reserve_bootmem(crash_base, crash_size,
247 BOOTMEM_EXCLUSIVE) < 0) {
248 printk(KERN_INFO "crashkernel reservation failed - "
249 "memory is in use\n");
250 return;
251 }
252
253 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
254 "for crashkernel (System RAM: %ldMB)\n",
255 (unsigned long)(crash_size >> 20),
256 (unsigned long)(crash_base >> 20),
257 (unsigned long)(total_mem >> 20));
258 crashk_res.start = crash_base;
259 crashk_res.end = crash_base + crash_size - 1;
260 insert_resource(&iomem_resource, &crashk_res);
261 }
262}
263#else
264static inline void __init reserve_crashkernel(void)
265{}
266#endif
267
268/* Overridden in paravirt.c if CONFIG_PARAVIRT */
269void __attribute__((weak)) __init memory_setup(void)
270{
271 machine_specific_memory_setup();
272}
273
274static void __init parse_setup_data(void)
275{
276 struct setup_data *data;
277 unsigned long pa_data;
278
279 if (boot_params.hdr.version < 0x0209)
280 return;
281 pa_data = boot_params.hdr.setup_data;
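	/* setup_data entries form a singly linked list of physical addresses handed over by the boot loader */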
282 while (pa_data) {
283 data = early_ioremap(pa_data, PAGE_SIZE);
284 switch (data->type) {
285 default:
286 break;
287 }
288#ifndef CONFIG_DEBUG_BOOT_PARAMS
289 free_early(pa_data, pa_data+sizeof(*data)+data->len);
290#endif
291 pa_data = data->next;
292 early_iounmap(data, PAGE_SIZE);
293 }
294}
295
296#ifdef CONFIG_PCI_MMCONFIG
297extern void __cpuinit fam10h_check_enable_mmcfg(void);
298extern void __init check_enable_amd_mmconf_dmi(void);
299#else
300void __cpuinit fam10h_check_enable_mmcfg(void)
301{
302}
303void __init check_enable_amd_mmconf_dmi(void)
304{
305}
306#endif
307
308/*
309 * setup_arch - architecture-specific boot-time initializations
310 *
311 * Note: On x86_64, fixmaps are ready for use even before this is called.
312 */
313void __init setup_arch(char **cmdline_p)
314{
315 unsigned i;
316
317 printk(KERN_INFO "Command line: %s\n", boot_command_line);
318
319 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
320 screen_info = boot_params.screen_info;
321 edid_info = boot_params.edid_info;
322 saved_video_mode = boot_params.hdr.vid_mode;
323 bootloader_type = boot_params.hdr.type_of_loader;
324
325#ifdef CONFIG_BLK_DEV_RAM
326 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
327 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
328 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
329#endif
330#ifdef CONFIG_EFI
331 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
332 "EL64", 4))
333 efi_enabled = 1;
334#endif
335
336 ARCH_SETUP
337
338 memory_setup();
339 copy_edd();
340
341 if (!boot_params.hdr.root_flags)
342 root_mountflags &= ~MS_RDONLY;
343 init_mm.start_code = (unsigned long) &_text;
344 init_mm.end_code = (unsigned long) &_etext;
345 init_mm.end_data = (unsigned long) &_edata;
346 init_mm.brk = (unsigned long) &_end;
347
348 code_resource.start = virt_to_phys(&_text);
349 code_resource.end = virt_to_phys(&_etext)-1;
350 data_resource.start = virt_to_phys(&_etext);
351 data_resource.end = virt_to_phys(&_edata)-1;
352 bss_resource.start = virt_to_phys(&__bss_start);
353 bss_resource.end = virt_to_phys(&__bss_stop)-1;
354
355 early_identify_cpu(&boot_cpu_data);
356
357 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
358 *cmdline_p = command_line;
359
360 parse_setup_data();
361
362 parse_early_param();
363
364#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
365 if (init_ohci1394_dma_early)
366 init_ohci1394_dma_on_all_controllers();
367#endif
368
369 finish_e820_parsing();
370
371 /* after parse_early_param, so could debug it */
372 insert_resource(&iomem_resource, &code_resource);
373 insert_resource(&iomem_resource, &data_resource);
374 insert_resource(&iomem_resource, &bss_resource);
375
376 early_gart_iommu_check();
377
378 e820_register_active_regions(0, 0, -1UL);
379 /*
380 * partially used pages are not usable - thus
381 * we are rounding upwards:
382 */
383 end_pfn = e820_end_of_ram();
384 /* update e820 for memory not covered by WB MTRRs */
385 mtrr_bp_init();
386 if (mtrr_trim_uncached_memory(end_pfn)) {
387 e820_register_active_regions(0, 0, -1UL);
388 end_pfn = e820_end_of_ram();
389 }
390
391 num_physpages = end_pfn;
392
393 check_efer();
394
395 max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
396 if (efi_enabled)
397 efi_init();
398
399 vsmp_init();
400
401 dmi_scan_machine();
402
403 io_delay_init();
404
405#ifdef CONFIG_KVM_CLOCK
406 kvmclock_init();
407#endif
408
409#ifdef CONFIG_SMP
410	/* set up to use the early static init tables during kernel startup */
411 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
412 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
413#ifdef CONFIG_NUMA
414 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
415#endif
416#endif
417
418#ifdef CONFIG_ACPI
419 /*
420 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
421 * Call this early for SRAT node setup.
422 */
423 acpi_boot_table_init();
424#endif
425
426 /* How many end-of-memory variables you have, grandma! */
427 max_low_pfn = end_pfn;
428 max_pfn = end_pfn;
429 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
430
431 /* Remove active ranges so rediscovery with NUMA-awareness happens */
432 remove_all_active_ranges();
433
434#ifdef CONFIG_ACPI_NUMA
435 /*
436 * Parse SRAT to discover nodes.
437 */
438 acpi_numa_init();
439#endif
440
441#ifdef CONFIG_NUMA
442 numa_initmem_init(0, end_pfn);
443#else
444 contig_initmem_init(0, end_pfn);
445#endif
446
447 dma32_reserve_bootmem();
448
449#ifdef CONFIG_ACPI_SLEEP
450 /*
451 * Reserve low memory region for sleep support.
452 */
453 acpi_reserve_bootmem();
454#endif
455
456 if (efi_enabled)
457 efi_reserve_bootmem();
458
459 /*
460 * Find and reserve possible boot-time SMP configuration:
461 */
462 find_smp_config();
463#ifdef CONFIG_BLK_DEV_INITRD
464 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
465 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
466 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
467 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
468 unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
469
470 if (ramdisk_end <= end_of_mem) {
471 /*
472 * don't need to reserve again, already reserved early
473 * in x86_64_start_kernel, and early_res_to_bootmem
474 * convert that to reserved in bootmem
475 */
476 initrd_start = ramdisk_image + PAGE_OFFSET;
477 initrd_end = initrd_start+ramdisk_size;
478 } else {
479 free_bootmem(ramdisk_image, ramdisk_size);
480 printk(KERN_ERR "initrd extends beyond end of memory "
481 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
482 ramdisk_end, end_of_mem);
483 initrd_start = 0;
484 }
485 }
486#endif
487 reserve_crashkernel();
488
489 reserve_ibft_region();
490
491 paging_init();
492 map_vsyscall();
493
494 early_quirks();
495
496#ifdef CONFIG_ACPI
497 /*
498 * Read APIC and some other early information from ACPI tables.
499 */
500 acpi_boot_init();
501#endif
502
503 init_cpu_to_node();
504
505 /*
506 * get boot-time SMP configuration:
507 */
508 if (smp_found_config)
509 get_smp_config();
510 init_apic_mappings();
511 ioapic_init_mappings();
512
513 kvm_guest_init();
514
515 /*
516 * We trust e820 completely. No explicit ROM probing in memory.
517 */
518 e820_reserve_resources();
519 e820_mark_nosave_regions();
520
521 /* request I/O space for devices used on all i[345]86 PCs */
522 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
523 request_resource(&ioport_resource, &standard_io_resources[i]);
524
525 e820_setup_gap();
526
527#ifdef CONFIG_VT
528#if defined(CONFIG_VGA_CONSOLE)
529 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
530 conswitchp = &vga_con;
531#elif defined(CONFIG_DUMMY_CONSOLE)
532 conswitchp = &dummy_con;
533#endif
534#endif
535
536 /* do this before identify_cpu for boot cpu */
537 check_enable_amd_mmconf_dmi();
538}
539
540static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
541{
542 unsigned int *v;
543
544 if (c->extended_cpuid_level < 0x80000004)
545 return 0;
546
547 v = (unsigned int *) c->x86_model_id;
548 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
549 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
550 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
551 c->x86_model_id[48] = 0;
552 return 1;
553}
554
555
556static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
557{
558 unsigned int n, dummy, eax, ebx, ecx, edx;
559
560 n = c->extended_cpuid_level;
561
562 if (n >= 0x80000005) {
563 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
564 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
565 "D cache %dK (%d bytes/line)\n",
566 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
567 c->x86_cache_size = (ecx>>24) + (edx>>24);
568 /* On K8 L1 TLB is inclusive, so don't count it */
569 c->x86_tlbsize = 0;
570 }
571
572 if (n >= 0x80000006) {
573 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
574 ecx = cpuid_ecx(0x80000006);
575 c->x86_cache_size = ecx >> 16;
576 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
577
578 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
579 c->x86_cache_size, ecx & 0xFF);
580 }
581 if (n >= 0x80000008) {
582 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
583 c->x86_virt_bits = (eax >> 8) & 0xff;
584 c->x86_phys_bits = eax & 0xff;
585 }
586}
587
588#ifdef CONFIG_NUMA
589static int __cpuinit nearby_node(int apicid)
590{
591 int i, node;
592
593 for (i = apicid - 1; i >= 0; i--) {
594 node = apicid_to_node[i];
595 if (node != NUMA_NO_NODE && node_online(node))
596 return node;
597 }
598 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
599 node = apicid_to_node[i];
600 if (node != NUMA_NO_NODE && node_online(node))
601 return node;
602 }
603 return first_node(node_online_map); /* Shouldn't happen */
604}
605#endif
606
607/*
608 * On an AMD dual-core setup the lower bits of the APIC ID distinguish the cores.
609 * Assumes the number of cores is a power of two.
610 */
611static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
612{
613#ifdef CONFIG_SMP
614 unsigned bits;
615#ifdef CONFIG_NUMA
616 int cpu = smp_processor_id();
617 int node = 0;
618 unsigned apicid = hard_smp_processor_id();
619#endif
620 bits = c->x86_coreid_bits;
621
622 /* Low order bits define the core id (index of core in socket) */
623 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
624 /* Convert the initial APIC ID into the socket ID */
625 c->phys_proc_id = c->initial_apicid >> bits;
626
627#ifdef CONFIG_NUMA
628 node = c->phys_proc_id;
629 if (apicid_to_node[apicid] != NUMA_NO_NODE)
630 node = apicid_to_node[apicid];
631 if (!node_online(node)) {
632 /* Two possibilities here:
633 - The CPU is missing memory and no node was created.
634 In that case try picking one from a nearby CPU
635 - The APIC IDs differ from the HyperTransport node IDs
636 which the K8 northbridge parsing fills in.
637 Assume they are all increased by a constant offset,
638 but in the same order as the HT nodeids.
639	   If that doesn't result in a usable node, fall back to the
640	   path for the previous case. */
641
642 int ht_nodeid = c->initial_apicid;
643
644 if (ht_nodeid >= 0 &&
645 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
646 node = apicid_to_node[ht_nodeid];
647 /* Pick a nearby node */
648 if (!node_online(node))
649 node = nearby_node(apicid);
650 }
651 numa_set_node(cpu, node);
652
653 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
654#endif
655#endif
656}
657
658static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
659{
660#ifdef CONFIG_SMP
661 unsigned bits, ecx;
662
663 /* Multi core CPU? */
664 if (c->extended_cpuid_level < 0x80000008)
665 return;
666
667 ecx = cpuid_ecx(0x80000008);
668
669 c->x86_max_cores = (ecx & 0xff) + 1;
670
671 /* CPU telling us the core id bits shift? */
672 bits = (ecx >> 12) & 0xF;
673
674 /* Otherwise recompute */
675 if (bits == 0) {
676 while ((1 << bits) < c->x86_max_cores)
677 bits++;
678 }
679
680 c->x86_coreid_bits = bits;
681
682#endif
683}
684
685#define ENABLE_C1E_MASK 0x18000000
686#define CPUID_PROCESSOR_SIGNATURE 1
687#define CPUID_XFAM 0x0ff00000
688#define CPUID_XFAM_K8 0x00000000
689#define CPUID_XFAM_10H 0x00100000
690#define CPUID_XFAM_11H 0x00200000
691#define CPUID_XMOD 0x000f0000
692#define CPUID_XMOD_REV_F 0x00040000
693
694/* AMD systems with C1E don't have a working local APIC timer. Check for that. */
695static __cpuinit int amd_apic_timer_broken(void)
696{
697 u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
698
699 switch (eax & CPUID_XFAM) {
700 case CPUID_XFAM_K8:
701 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
702 break;
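		/* rev F and later K8 fall through: they share the C1E check with family 10h/11h */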
703 case CPUID_XFAM_10H:
704 case CPUID_XFAM_11H:
705 rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
706 if (lo & ENABLE_C1E_MASK)
707 return 1;
708 break;
709 default:
710 /* err on the side of caution */
711 return 1;
712 }
713 return 0;
714}
715
716static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
717{
718 early_init_amd_mc(c);
719
720 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
721 if (c->x86_power & (1<<8))
722 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
723}
724
725static void __cpuinit init_amd(struct cpuinfo_x86 *c)
726{
727 unsigned level;
728
729#ifdef CONFIG_SMP
730 unsigned long value;
731
732 /*
733 * Disable TLB flush filter by setting HWCR.FFDIS on K8
734 * bit 6 of msr C001_0015
735 *
736 * Errata 63 for SH-B3 steppings
737 * Errata 122 for all steppings (F+ have it disabled by default)
738 */
739 if (c->x86 == 15) {
740 rdmsrl(MSR_K8_HWCR, value);
741 value |= 1 << 6;
742 wrmsrl(MSR_K8_HWCR, value);
743 }
744#endif
745
746	/* Bit 31 of the standard CPUID leaf was used for a nonstandard 3DNow ID;
747	   3DNow is identified by bit 31 of the extended CPUID leaf (1*32+31) anyway */
748 clear_cpu_cap(c, 0*32+31);
749
750 /* On C+ stepping K8 rep microcode works well for copy/memset */
751 level = cpuid_eax(1);
752 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
753 level >= 0x0f58))
754 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
755 if (c->x86 == 0x10 || c->x86 == 0x11)
756 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
757
758 /* Enable workaround for FXSAVE leak */
759 if (c->x86 >= 6)
760 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
761
762 level = get_model_name(c);
763 if (!level) {
764 switch (c->x86) {
765 case 15:
766			/* Should distinguish models here, but this is only
767			   a fallback anyway. */
768 strcpy(c->x86_model_id, "Hammer");
769 break;
770 }
771 }
772 display_cacheinfo(c);
773
774 /* Multi core CPU? */
775 if (c->extended_cpuid_level >= 0x80000008)
776 amd_detect_cmp(c);
777
778 if (c->extended_cpuid_level >= 0x80000006 &&
779 (cpuid_edx(0x80000006) & 0xf000))
780 num_cache_leaves = 4;
781 else
782 num_cache_leaves = 3;
783
784 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
785 set_cpu_cap(c, X86_FEATURE_K8);
786
787 /* MFENCE stops RDTSC speculation */
788 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
789
790 if (c->x86 == 0x10)
791 fam10h_check_enable_mmcfg();
792
793 if (amd_apic_timer_broken())
794 disable_apic_timer = 1;
795
796 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
797 unsigned long long tseg;
798
799 /*
800 * Split up direct mapping around the TSEG SMM area.
801 * Don't do it for gbpages because there seems very little
802 * benefit in doing so.
803 */
804 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
805 (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
806 set_memory_4k((unsigned long)__va(tseg), 1);
807 }
808}
809
810void __cpuinit detect_ht(struct cpuinfo_x86 *c)
811{
812#ifdef CONFIG_SMP
813 u32 eax, ebx, ecx, edx;
814 int index_msb, core_bits;
815
816 cpuid(1, &eax, &ebx, &ecx, &edx);
817
818
819 if (!cpu_has(c, X86_FEATURE_HT))
820 return;
821 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
822 goto out;
823
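	/* EBX[23:16] of CPUID leaf 1: logical processor (sibling) count per physical package */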
824 smp_num_siblings = (ebx & 0xff0000) >> 16;
825
826 if (smp_num_siblings == 1) {
827 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
828 } else if (smp_num_siblings > 1) {
829
830 if (smp_num_siblings > NR_CPUS) {
831 printk(KERN_WARNING "CPU: Unsupported number of "
832 "siblings %d", smp_num_siblings);
833 smp_num_siblings = 1;
834 return;
835 }
836
837 index_msb = get_count_order(smp_num_siblings);
838 c->phys_proc_id = phys_pkg_id(index_msb);
839
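		/* smp_num_siblings counted threads per package; divide by the core count to get threads per core */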
840 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
841
842 index_msb = get_count_order(smp_num_siblings);
843
844 core_bits = get_count_order(c->x86_max_cores);
845
846 c->cpu_core_id = phys_pkg_id(index_msb) &
847 ((1 << core_bits) - 1);
848 }
849out:
850 if ((c->x86_max_cores * smp_num_siblings) > 1) {
851 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
852 c->phys_proc_id);
853 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
854 c->cpu_core_id);
855 }
856
857#endif
858}
859
860/*
861 * find out the number of processor cores on the die
862 */
863static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
864{
865 unsigned int eax, t;
866
867 if (c->cpuid_level < 4)
868 return 1;
869
870 cpuid_count(4, 0, &eax, &t, &t, &t);
871
872 if (eax & 0x1f)
873 return ((eax >> 26) + 1);
874 else
875 return 1;
876}
877
878static void __cpuinit srat_detect_node(void)
879{
880#ifdef CONFIG_NUMA
881 unsigned node;
882 int cpu = smp_processor_id();
883 int apicid = hard_smp_processor_id();
884
885	/* For now, don't do the funky fallback heuristics that the
886	   AMD version employs. */
887 node = apicid_to_node[apicid];
888 if (node == NUMA_NO_NODE || !node_online(node))
889 node = first_node(node_online_map);
890 numa_set_node(cpu, node);
891
892 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
893#endif
894}
895
896static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
897{
898 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
899 (c->x86 == 0x6 && c->x86_model >= 0x0e))
900 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
901}
902
903static void __cpuinit init_intel(struct cpuinfo_x86 *c)
904{
905 /* Cache sizes */
906 unsigned n;
907
908 init_intel_cacheinfo(c);
909 if (c->cpuid_level > 9) {
910 unsigned eax = cpuid_eax(10);
911 /* Check for version and the number of counters */
912 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
913 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
914 }
915
916 if (cpu_has_ds) {
917 unsigned int l1, l2;
918 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
919 if (!(l1 & (1<<11)))
920 set_cpu_cap(c, X86_FEATURE_BTS);
921 if (!(l1 & (1<<12)))
922 set_cpu_cap(c, X86_FEATURE_PEBS);
923 }
924
925
926 if (cpu_has_bts)
927 ds_init_intel(c);
928
929 n = c->extended_cpuid_level;
930 if (n >= 0x80000008) {
931 unsigned eax = cpuid_eax(0x80000008);
932 c->x86_virt_bits = (eax >> 8) & 0xff;
933 c->x86_phys_bits = eax & 0xff;
934 /* CPUID workaround for Intel 0F34 CPU */
935 if (c->x86_vendor == X86_VENDOR_INTEL &&
936 c->x86 == 0xF && c->x86_model == 0x3 &&
937 c->x86_mask == 0x4)
938 c->x86_phys_bits = 36;
939 }
940
941 if (c->x86 == 15)
942 c->x86_cache_alignment = c->x86_clflush_size * 2;
943 if (c->x86 == 6)
944 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
945 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
946 c->x86_max_cores = intel_num_cpu_cores(c);
947
948 srat_detect_node();
949}
950
951static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
952{
953 if (c->x86 == 0x6 && c->x86_model >= 0xf)
954 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
955}
956
957static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
958{
959 /* Cache sizes */
960 unsigned n;
961
962 n = c->extended_cpuid_level;
963 if (n >= 0x80000008) {
964 unsigned eax = cpuid_eax(0x80000008);
965 c->x86_virt_bits = (eax >> 8) & 0xff;
966 c->x86_phys_bits = eax & 0xff;
967 }
968
969 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
970 c->x86_cache_alignment = c->x86_clflush_size * 2;
971 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
972 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
973 }
974 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
975}
976
977static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
978{
979 char *v = c->x86_vendor_id;
980
981 if (!strcmp(v, "AuthenticAMD"))
982 c->x86_vendor = X86_VENDOR_AMD;
983 else if (!strcmp(v, "GenuineIntel"))
984 c->x86_vendor = X86_VENDOR_INTEL;
985 else if (!strcmp(v, "CentaurHauls"))
986 c->x86_vendor = X86_VENDOR_CENTAUR;
987 else
988 c->x86_vendor = X86_VENDOR_UNKNOWN;
989}
990
991/* Do some early cpuid on the boot CPU to get the parameters that are
992   needed before check_bugs. Everything advanced is in identify_cpu
993   below. */
994static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
995{
996 u32 tfms, xlvl;
997
998 c->loops_per_jiffy = loops_per_jiffy;
999 c->x86_cache_size = -1;
1000 c->x86_vendor = X86_VENDOR_UNKNOWN;
1001 c->x86_model = c->x86_mask = 0; /* So far unknown... */
1002 c->x86_vendor_id[0] = '\0'; /* Unset */
1003 c->x86_model_id[0] = '\0'; /* Unset */
1004 c->x86_clflush_size = 64;
1005 c->x86_cache_alignment = c->x86_clflush_size;
1006 c->x86_max_cores = 1;
1007 c->x86_coreid_bits = 0;
1008 c->extended_cpuid_level = 0;
1009 memset(&c->x86_capability, 0, sizeof c->x86_capability);
1010
1011 /* Get vendor name */
1012 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
1013 (unsigned int *)&c->x86_vendor_id[0],
1014 (unsigned int *)&c->x86_vendor_id[8],
1015 (unsigned int *)&c->x86_vendor_id[4]);
1016
1017 get_cpu_vendor(c);
1018
1019 /* Initialize the standard set of capabilities */
1020 /* Note that the vendor-specific code below might override */
1021
1022 /* Intel-defined flags: level 0x00000001 */
1023 if (c->cpuid_level >= 0x00000001) {
1024 __u32 misc;
1025 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
1026 &c->x86_capability[0]);
1027 c->x86 = (tfms >> 8) & 0xf;
1028 c->x86_model = (tfms >> 4) & 0xf;
1029 c->x86_mask = tfms & 0xf;
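		/* merge in the extended family and model fields where they apply */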
1030 if (c->x86 == 0xf)
1031 c->x86 += (tfms >> 20) & 0xff;
1032 if (c->x86 >= 0x6)
1033 c->x86_model += ((tfms >> 16) & 0xF) << 4;
1034 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
1035 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
1036 } else {
1037 /* Have CPUID level 0 only - unheard of */
1038 c->x86 = 4;
1039 }
1040
1041 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
1042#ifdef CONFIG_SMP
1043 c->phys_proc_id = c->initial_apicid;
1044#endif
1045 /* AMD-defined flags: level 0x80000001 */
1046 xlvl = cpuid_eax(0x80000000);
1047 c->extended_cpuid_level = xlvl;
1048 if ((xlvl & 0xffff0000) == 0x80000000) {
1049 if (xlvl >= 0x80000001) {
1050 c->x86_capability[1] = cpuid_edx(0x80000001);
1051 c->x86_capability[6] = cpuid_ecx(0x80000001);
1052 }
1053 if (xlvl >= 0x80000004)
1054 get_model_name(c); /* Default name */
1055 }
1056
1057 /* Transmeta-defined flags: level 0x80860001 */
1058 xlvl = cpuid_eax(0x80860000);
1059 if ((xlvl & 0xffff0000) == 0x80860000) {
1060		/* Don't set x86_cpuid_level here for now, to avoid confusion. */
1061 if (xlvl >= 0x80860001)
1062 c->x86_capability[2] = cpuid_edx(0x80860001);
1063 }
1064
1065 c->extended_cpuid_level = cpuid_eax(0x80000000);
1066 if (c->extended_cpuid_level >= 0x80000007)
1067 c->x86_power = cpuid_edx(0x80000007);
1068
1069 switch (c->x86_vendor) {
1070 case X86_VENDOR_AMD:
1071 early_init_amd(c);
1072 break;
1073 case X86_VENDOR_INTEL:
1074 early_init_intel(c);
1075 break;
1076 case X86_VENDOR_CENTAUR:
1077 early_init_centaur(c);
1078 break;
1079 }
1080
1081 validate_pat_support(c);
1082}
1083
1084/*
1085 * This does the hard work of actually picking apart the CPU stuff...
1086 */
1087void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1088{
1089 int i;
1090
1091 early_identify_cpu(c);
1092
1093 init_scattered_cpuid_features(c);
1094
1095 c->apicid = phys_pkg_id(0);
1096
1097 /*
1098 * Vendor-specific initialization. In this section we
1099	 * canonicalize the feature flags, meaning that if a CPU
1100	 * supports features which CPUID doesn't tell us about,
1101	 * CPUID claims incorrect flags, or there are other bugs,
1102	 * we handle them here.
1103 *
1104 * At the end of this section, c->x86_capability better
1105 * indicate the features this CPU genuinely supports!
1106 */
1107 switch (c->x86_vendor) {
1108 case X86_VENDOR_AMD:
1109 init_amd(c);
1110 break;
1111
1112 case X86_VENDOR_INTEL:
1113 init_intel(c);
1114 break;
1115
1116 case X86_VENDOR_CENTAUR:
1117 init_centaur(c);
1118 break;
1119
1120 case X86_VENDOR_UNKNOWN:
1121 default:
1122 display_cacheinfo(c);
1123 break;
1124 }
1125
1126 detect_ht(c);
1127
1128 /*
1129 * On SMP, boot_cpu_data holds the common feature set between
1130 * all CPUs; so make sure that we indicate which features are
1131 * common between the CPUs. The first time this routine gets
1132 * executed, c == &boot_cpu_data.
1133 */
1134 if (c != &boot_cpu_data) {
1135 /* AND the already accumulated flags with these */
1136 for (i = 0; i < NCAPINTS; i++)
1137 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1138 }
1139
1140	/* Clear all flags overridden by options */
1141 for (i = 0; i < NCAPINTS; i++)
1142 c->x86_capability[i] &= ~cleared_cpu_caps[i];
1143
1144#ifdef CONFIG_X86_MCE
1145 mcheck_init(c);
1146#endif
1147 select_idle_routine(c);
1148
1149#ifdef CONFIG_NUMA
1150 numa_add_cpu(smp_processor_id());
1151#endif
1152
1153}
1154
1155void __cpuinit identify_boot_cpu(void)
1156{
1157 identify_cpu(&boot_cpu_data);
1158}
1159
1160void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1161{
1162 BUG_ON(c == &boot_cpu_data);
1163 identify_cpu(c);
1164 mtrr_ap_init();
1165}
1166
1167static __init int setup_noclflush(char *arg)
1168{
1169 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1170 return 1;
1171}
1172__setup("noclflush", setup_noclflush);
1173
1174void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1175{
1176 if (c->x86_model_id[0])
1177 printk(KERN_CONT "%s", c->x86_model_id);
1178
1179 if (c->x86_mask || c->cpuid_level >= 0)
1180 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1181 else
1182 printk(KERN_CONT "\n");
1183}
1184
1185static __init int setup_disablecpuid(char *arg)
1186{
1187 int bit;
1188 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1189 setup_clear_cpu_cap(bit);
1190 else
1191 return 0;
1192 return 1;
1193}
1194__setup("clearcpuid=", setup_disablecpuid);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 000000000000..0e67f72d9316
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,385 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/percpu.h>
6#include <linux/kexec.h>
7#include <linux/crash_dump.h>
8#include <asm/smp.h>
9#include <asm/percpu.h>
10#include <asm/sections.h>
11#include <asm/processor.h>
12#include <asm/setup.h>
13#include <asm/topology.h>
14#include <asm/mpspec.h>
15#include <asm/apicdef.h>
16#include <asm/highmem.h>
17
18#ifdef CONFIG_X86_LOCAL_APIC
19unsigned int num_processors;
20unsigned disabled_cpus __cpuinitdata;
21/* Processor that is doing the boot up */
22unsigned int boot_cpu_physical_apicid = -1U;
23unsigned int max_physical_apicid;
24EXPORT_SYMBOL(boot_cpu_physical_apicid);
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28#endif
29
30/* map cpu index to physical APIC ID */
31DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
32DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
34EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
35
36#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
37#define X86_64_NUMA 1
38
39/* map cpu index to node index */
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42
43/* which logical CPUs are on which nodes */
44cpumask_t *node_to_cpumask_map;
45EXPORT_SYMBOL(node_to_cpumask_map);
46
47/* setup node_to_cpumask_map */
48static void __init setup_node_to_cpumask_map(void);
49
50#else
51static inline void setup_node_to_cpumask_map(void) { }
52#endif
53
54#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
55/*
56 * Copy data used in early init routines from the initial arrays to the
57 * per cpu data areas. These arrays then become expendable and the
58 * *_early_ptr's are zeroed, indicating that the static arrays are gone.
59 */
60static void __init setup_per_cpu_maps(void)
61{
62 int cpu;
63
64 for_each_possible_cpu(cpu) {
65 per_cpu(x86_cpu_to_apicid, cpu) =
66 early_per_cpu_map(x86_cpu_to_apicid, cpu);
67 per_cpu(x86_bios_cpu_apicid, cpu) =
68 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
69#ifdef X86_64_NUMA
70 per_cpu(x86_cpu_to_node_map, cpu) =
71 early_per_cpu_map(x86_cpu_to_node_map, cpu);
72#endif
73 }
74
75 /* indicate the early static arrays will soon be gone */
76 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
77 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
78#ifdef X86_64_NUMA
79 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
80#endif
81}
82
83#ifdef CONFIG_X86_32
84/*
85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
86 * the same way
87 */
88unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
89EXPORT_SYMBOL(__per_cpu_offset);
90static inline void setup_cpu_pda_map(void) { }
91
92#elif !defined(CONFIG_SMP)
93static inline void setup_cpu_pda_map(void) { }
94
95#else /* CONFIG_SMP && CONFIG_X86_64 */
96
97/*
98 * Allocate cpu_pda pointer table and array via alloc_bootmem.
99 */
100static void __init setup_cpu_pda_map(void)
101{
102 char *pda;
103 struct x8664_pda **new_cpu_pda;
104 unsigned long size;
105 int cpu;
106
107 size = roundup(sizeof(struct x8664_pda), cache_line_size());
108
109 /* allocate cpu_pda array and pointer table */
110 {
111 unsigned long tsize = nr_cpu_ids * sizeof(void *);
112 unsigned long asize = size * (nr_cpu_ids - 1);
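		/* nr_cpu_ids - 1: the boot CPU keeps its statically allocated pda (see below) */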
113
114 tsize = roundup(tsize, cache_line_size());
115 new_cpu_pda = alloc_bootmem(tsize + asize);
116 pda = (char *)new_cpu_pda + tsize;
117 }
118
119 /* initialize pointer table to static pda's */
120 for_each_possible_cpu(cpu) {
121 if (cpu == 0) {
122 /* leave boot cpu pda in place */
123 new_cpu_pda[0] = cpu_pda(0);
124 continue;
125 }
126 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
127 new_cpu_pda[cpu]->in_bootmem = 1;
128 pda += size;
129 }
130
131 /* point to new pointer table */
132 _cpu_pda = new_cpu_pda;
133}
134#endif
135
136/*
137 * Great future plan:
138 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
139 * Always point %gs to its beginning
140 */
141void __init setup_per_cpu_areas(void)
142{
143 ssize_t size = PERCPU_ENOUGH_ROOM;
144 char *ptr;
145 int cpu;
146
147 /* Setup cpu_pda map */
148 setup_cpu_pda_map();
149
150 /* Copy section for each CPU (we discard the original) */
151 size = PERCPU_ENOUGH_ROOM;
152 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
153 size);
154
155 for_each_possible_cpu(cpu) {
156#ifndef CONFIG_NEED_MULTIPLE_NODES
157 ptr = alloc_bootmem_pages(size);
158#else
159 int node = early_cpu_to_node(cpu);
160 if (!node_online(node) || !NODE_DATA(node)) {
161 ptr = alloc_bootmem_pages(size);
162 printk(KERN_INFO
163 "cpu %d has no node %d or node-local memory\n",
164 cpu, node);
165 if (ptr)
166 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
167 cpu, __pa(ptr));
168 }
169 else {
170 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
171 if (ptr)
172 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
173 cpu, node, __pa(ptr));
174 }
175#endif
176 per_cpu_offset(cpu) = ptr - __per_cpu_start;
177 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
178
179 }
180
181 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
182 NR_CPUS, nr_cpu_ids, nr_node_ids);
183
184 /* Setup percpu data maps */
185 setup_per_cpu_maps();
186
187 /* Setup node to cpumask map */
188 setup_node_to_cpumask_map();
189}
190
191#endif
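Editor's note: the core trick in setup_per_cpu_areas() above is that each CPU gets a private copy of the .data.percpu section and per_cpu_offset(cpu) is the delta between that copy and the reference section, so a per-cpu access is "address of the variable in the reference section plus the CPU's offset". A simplified userspace model with one variable instead of a whole section (the pointer arithmetic between separate allocations mimics the kernel trick and is only a model):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 2

/* reference copy, analogous to the variable inside .data.percpu */
static long counter_reference = 42;

static long per_cpu_offset[NR_CPUS];

/* analogous to per_cpu(var, cpu): reference address + that cpu's offset */
#define per_cpu(var, cpu) \
	(*(long *)((char *)&(var) + per_cpu_offset[cpu]))

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		char *copy = malloc(sizeof(counter_reference));

		memcpy(copy, &counter_reference, sizeof(counter_reference));
		per_cpu_offset[cpu] = copy - (char *)&counter_reference;
	}

	per_cpu(counter_reference, 0) = 100;
	per_cpu(counter_reference, 1) = 200;
	printf("cpu0=%ld cpu1=%ld ref=%ld\n",
	       per_cpu(counter_reference, 0),
	       per_cpu(counter_reference, 1), counter_reference);
	return 0;
}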
192
193#ifdef X86_64_NUMA
194
195/*
196 * Allocate node_to_cpumask_map based on number of available nodes
197 * Requires node_possible_map to be valid.
198 *
199 * Note: node_to_cpumask() is not valid until after this is done.
200 */
201static void __init setup_node_to_cpumask_map(void)
202{
203 unsigned int node, num = 0;
204 cpumask_t *map;
205
206 /* setup nr_node_ids if not done yet */
207 if (nr_node_ids == MAX_NUMNODES) {
208 for_each_node_mask(node, node_possible_map)
209 num = node;
210 nr_node_ids = num + 1;
211 }
212
213 /* allocate the map */
214 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
215
216 pr_debug("Node to cpumask map at %p for %d nodes\n",
217 map, nr_node_ids);
218
219 /* node_to_cpumask() will now work */
220 node_to_cpumask_map = map;
221}
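Editor's note: if nr_node_ids still holds its MAX_NUMNODES default, setup_node_to_cpumask_map() derives it as the highest bit set in node_possible_map plus one, and sizes the cpumask array to exactly that many nodes. The same derivation on a plain bitmask, as a sketch:

#include <stdio.h>

#define MAX_NUMNODES 16

int main(void)
{
	unsigned long node_possible_map = 0x13;	/* nodes 0, 1 and 4 exist */
	unsigned int nr_node_ids = MAX_NUMNODES;
	unsigned int node, num = 0;

	/* highest possible node id + 1, mirroring setup_node_to_cpumask_map() */
	if (nr_node_ids == MAX_NUMNODES) {
		for (node = 0; node < MAX_NUMNODES; node++)
			if (node_possible_map & (1UL << node))
				num = node;
		nr_node_ids = num + 1;
	}

	printf("nr_node_ids = %u\n", nr_node_ids);	/* prints 5 */
	return 0;
}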
222
223void __cpuinit numa_set_node(int cpu, int node)
224{
225 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
226
227 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
228 cpu_pda(cpu)->nodenumber = node;
229
230 if (cpu_to_node_map)
231 cpu_to_node_map[cpu] = node;
232
233 else if (per_cpu_offset(cpu))
234 per_cpu(x86_cpu_to_node_map, cpu) = node;
235
236 else
237 pr_debug("Setting node for non-present cpu %d\n", cpu);
238}
239
240void __cpuinit numa_clear_node(int cpu)
241{
242 numa_set_node(cpu, NUMA_NO_NODE);
243}
244
245#ifndef CONFIG_DEBUG_PER_CPU_MAPS
246
247void __cpuinit numa_add_cpu(int cpu)
248{
249 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
250}
251
252void __cpuinit numa_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
255}
256
257#else /* CONFIG_DEBUG_PER_CPU_MAPS */
258
259/*
260 * --------- debug versions of the numa functions ---------
261 */
262static void __cpuinit numa_set_cpumask(int cpu, int enable)
263{
264 int node = cpu_to_node(cpu);
265 cpumask_t *mask;
266 char buf[64];
267
268 if (node_to_cpumask_map == NULL) {
269 printk(KERN_ERR "node_to_cpumask_map NULL\n");
270 dump_stack();
271 return;
272 }
273
274 mask = &node_to_cpumask_map[node];
275 if (enable)
276 cpu_set(cpu, *mask);
277 else
278 cpu_clear(cpu, *mask);
279
280 cpulist_scnprintf(buf, sizeof(buf), *mask);
281 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
282 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
283}
284
285void __cpuinit numa_add_cpu(int cpu)
286{
287 numa_set_cpumask(cpu, 1);
288}
289
290void __cpuinit numa_remove_cpu(int cpu)
291{
292 numa_set_cpumask(cpu, 0);
293}
294
295int cpu_to_node(int cpu)
296{
297 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
298 printk(KERN_WARNING
299 "cpu_to_node(%d): usage too early!\n", cpu);
300 dump_stack();
301 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
302 }
303 return per_cpu(x86_cpu_to_node_map, cpu);
304}
305EXPORT_SYMBOL(cpu_to_node);
306
307/*
308 * Same function as cpu_to_node() but used if called before the
309 * per_cpu areas are setup.
310 */
311int early_cpu_to_node(int cpu)
312{
313 if (early_per_cpu_ptr(x86_cpu_to_node_map))
314 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
315
316 if (!per_cpu_offset(cpu)) {
317 printk(KERN_WARNING
318 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
319 dump_stack();
320 return NUMA_NO_NODE;
321 }
322 return per_cpu(x86_cpu_to_node_map, cpu);
323}
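Editor's note: numa_set_node() and the accessors above all follow the same precedence: use the early __initdata table while it still exists, fall back to the per-cpu variable once the per-cpu areas are live, and warn (returning a safe default) otherwise. A condensed userspace model of that read path; names and values are illustrative:

#include <stdio.h>

#define NUMA_NO_NODE	(-1)
#define NR_CPUS		4

static int early_cpu_to_node_map[NR_CPUS] = { 0, 0, 1, 1 };
static int *early_map = early_cpu_to_node_map;	/* NULLed once per-cpu is up */

static int percpu_node[NR_CPUS];
static int percpu_ready;			/* stands in for per_cpu_offset(cpu) */

static int early_cpu_to_node(int cpu)
{
	if (early_map)
		return early_map[cpu];
	if (!percpu_ready) {
		fprintf(stderr, "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		return NUMA_NO_NODE;		/* safe default instead of crashing */
	}
	return percpu_node[cpu];
}

int main(void)
{
	printf("node of cpu2 = %d\n", early_cpu_to_node(2));
	early_map = NULL;	/* early table retired, per-cpu not yet ready */
	printf("node of cpu2 = %d\n", early_cpu_to_node(2));
	return 0;
}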
324
325
326/* empty cpumask */
327static const cpumask_t cpu_mask_none;
328
329/*
330 * Returns a pointer to the bitmask of CPUs on Node 'node'.
331 */
332const cpumask_t *_node_to_cpumask_ptr(int node)
333{
334 if (node_to_cpumask_map == NULL) {
335 printk(KERN_WARNING
336 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
337 node);
338 dump_stack();
339 return (const cpumask_t *)&cpu_online_map;
340 }
341 if (node >= nr_node_ids) {
342 printk(KERN_WARNING
343 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
344 node, nr_node_ids);
345 dump_stack();
346 return &cpu_mask_none;
347 }
348 return &node_to_cpumask_map[node];
349}
350EXPORT_SYMBOL(_node_to_cpumask_ptr);
351
352/*
353 * Returns a bitmask of CPUs on Node 'node'.
354 *
355 * Side note: this function creates the returned cpumask on the stack
356 * so with a high NR_CPUS count, excessive stack space is used. The
357 * node_to_cpumask_ptr function should be used whenever possible.
358 */
359cpumask_t node_to_cpumask(int node)
360{
361 if (node_to_cpumask_map == NULL) {
362 printk(KERN_WARNING
363 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
364 dump_stack();
365 return cpu_online_map;
366 }
367 if (node >= nr_node_ids) {
368 printk(KERN_WARNING
369 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
370 node, nr_node_ids);
371 dump_stack();
372 return cpu_mask_none;
373 }
374 return node_to_cpumask_map[node];
375}
376EXPORT_SYMBOL(node_to_cpumask);
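Editor's note: the stack-usage warning above is about object size. cpumask_t is NR_CPUS bits, so with NR_CPUS=4096 returning it by value copies half a kilobyte through the caller's stack per call, while the _ptr variant just hands back an address into the bootmem array. A small illustration of the difference (the cpumask_t definition here is a simplified stand-in):

#include <stdio.h>

#define NR_CPUS 4096

typedef struct {
	unsigned long bits[NR_CPUS / (8 * sizeof(unsigned long))];
} cpumask_t;

static cpumask_t node_masks[2];

/* by value: the whole bitmap travels through the caller's stack frame */
static cpumask_t node_to_cpumask(int node) { return node_masks[node]; }

/* by pointer: constant-size result regardless of NR_CPUS */
static const cpumask_t *node_to_cpumask_ptr(int node) { return &node_masks[node]; }

int main(void)
{
	cpumask_t copy;
	const cpumask_t *ref;

	printf("sizeof(cpumask_t) = %zu bytes\n", sizeof(cpumask_t));
	copy = node_to_cpumask(0);		/* 512-byte copy on this config */
	ref = node_to_cpumask_ptr(1);		/* pointer-sized result */
	printf("copy at %p, ref at %p\n", (void *)&copy, (const void *)ref);
	return 0;
}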
377
378/*
379 * --------- end of debug versions of the numa functions ---------
380 */
381
382#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
383
384#endif /* X86_64_NUMA */
385
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
3 char __user *pretcode; 3 char __user *pretcode;
4 int sig; 4 int sig;
5 struct sigcontext sc; 5 struct sigcontext sc;
6 struct _fpstate fpstate; 6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This move allows the FP state and the future
9 * state extensions (xsave) to stay together.
10 * At the same time, retaining the unused fpstate field keeps the
11 * offset of extramask[] in the sigframe unchanged and thus prevents
12 * any legacy application from accessing or modifying it.
13 */
14 struct _fpstate fpstate_unused;
7 unsigned long extramask[_NSIG_WORDS-1]; 15 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8]; 16 char retcode[8];
17 /* fp state follows here */
9}; 18};
10 19
11struct rt_sigframe { 20struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
15 void __user *puc; 24 void __user *puc;
16 struct siginfo info; 25 struct siginfo info;
17 struct ucontext uc; 26 struct ucontext uc;
18 struct _fpstate fpstate;
19 char retcode[8]; 27 char retcode[8];
28 /* fp state follows here */
20}; 29};
21#else 30#else
22struct rt_sigframe { 31struct rt_sigframe {
23 char __user *pretcode; 32 char __user *pretcode;
24 struct ucontext uc; 33 struct ucontext uc;
25 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */
26}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
27#endif 42#endif
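Editor's note: the point of keeping the now-unused fpstate field in struct sigframe is purely ABI: extramask[] and retcode[] must stay at the offsets legacy userspace expects, while the live FP/xsave state is appended after retcode[] on the signal stack. A userspace sketch that checks the invariant with offsetof; the member sizes are made up and _NSIG_WORDS is assumed to be 2 as on x86:

#include <stdio.h>
#include <stddef.h>

#define _NSIG_WORDS 2

struct fpstate_stub { char bytes[112]; };	/* stand-in for struct _fpstate */

struct old_sigframe {
	char *pretcode;
	int sig;
	char sc[88];				/* stand-in for struct sigcontext */
	struct fpstate_stub fpstate;
	unsigned long extramask[_NSIG_WORDS - 1];
	char retcode[8];
};

struct new_sigframe {
	char *pretcode;
	int sig;
	char sc[88];
	struct fpstate_stub fpstate_unused;	/* kept only to preserve offsets */
	unsigned long extramask[_NSIG_WORDS - 1];
	char retcode[8];
	/* real fp/xsave state is written after this point on the signal stack */
};

int main(void)
{
	printf("extramask offset: old %zu, new %zu\n",
	       offsetof(struct old_sigframe, extramask),
	       offsetof(struct new_sigframe, extramask));
	return 0;
}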
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -26,6 +27,8 @@
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
31#include <asm/syscalls.h>
29 32
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
110 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
111} 114}
112 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
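Editor's note: the COPY*/GET_SEG macros added above all follow one pattern: every __get_user() result is OR-ed into a single err accumulator so restore_sigcontext() can issue many userspace reads and check for a fault once at the end. A userspace model of that pattern, with __get_user simulated and the structures reduced to stubs:

#include <stdio.h>

struct sigcontext_stub { unsigned long ip, sp; unsigned short cs; };
struct regs_stub { unsigned long ip, sp, cs; };

/* simulated __get_user(): returns 0 on success, nonzero on fault */
static int fake_get_user(unsigned long *dst, const unsigned long *src)
{
	*dst = *src;
	return 0;
}

#define COPY(x)	{ err |= fake_get_user(&regs->x, &sc->x); }

static int restore_regs(struct regs_stub *regs, const struct sigcontext_stub *sc)
{
	int err = 0;
	unsigned long tmp = sc->cs;

	COPY(ip);
	COPY(sp);
	regs->cs = tmp | 3;	/* like COPY_SEG_STRICT: force user-mode RPL */
	return err;		/* non-zero if any access faulted */
}

int main(void)
{
	struct sigcontext_stub sc = { 0x1000, 0x2000, 0x23 };
	struct regs_stub regs;
	int err = restore_regs(&regs, &sc);

	printf("err=%d ip=%#lx cs=%#lx\n", err, regs.ip, regs.cs);
	return 0;
}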
113 137
114/* 138/*
115 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
118restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
119 unsigned long *pax) 143 unsigned long *pax)
120{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
121 unsigned int err = 0; 147 unsigned int err = 0;
122 148
123 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
124 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
125 151
126#define COPY(x) err |= __get_user(regs->x, &sc->x)
127
128#define COPY_SEG(seg) \
129 { unsigned short tmp; \
130 err |= __get_user(tmp, &sc->seg); \
131 regs->seg = tmp; }
132
133#define COPY_SEG_STRICT(seg) \
134 { unsigned short tmp; \
135 err |= __get_user(tmp, &sc->seg); \
136 regs->seg = tmp|3; }
137
138#define GET_SEG(seg) \
139 { unsigned short tmp; \
140 err |= __get_user(tmp, &sc->seg); \
141 loadsegment(seg, tmp); }
142
143 GET_SEG(gs); 152 GET_SEG(gs);
144 COPY_SEG(fs); 153 COPY_SEG(fs);
145 COPY_SEG(es); 154 COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
150 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
151 160
152 { 161 err |= __get_user(tmpflags, &sc->flags);
153 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
154 163 regs->orig_ax = -1; /* disable syscall checks */
155 err |= __get_user(tmpflags, &sc->flags);
156 regs->flags = (regs->flags & ~FIX_EFLAGS) |
157 (tmpflags & FIX_EFLAGS);
158 regs->orig_ax = -1; /* disable syscall checks */
159 }
160 164
161 { 165 err |= __get_user(buf, &sc->fpstate);
162 struct _fpstate __user *buf; 166 err |= restore_i387_xstate(buf);
163
164 err |= __get_user(buf, &sc->fpstate);
165 if (buf) {
166 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
167 goto badframe;
168 err |= restore_i387(buf);
169 } else {
170 struct task_struct *me = current;
171
172 if (used_math()) {
173 clear_fpu(me);
174 clear_used_math();
175 }
176 }
177 }
178 167
179 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
180 return err; 169 return err;
181
182badframe:
183 return 1;
184} 170}
185 171
186asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -212,7 +198,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 198
213badframe: 199badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 200 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 201 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 202 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 204 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -226,9 +212,8 @@ badframe:
226 return 0; 212 return 0;
227} 213}
228 214
229asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
230{ 216{
231 struct pt_regs *regs = (struct pt_regs *)&__unused;
232 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
233 unsigned long ax; 218 unsigned long ax;
234 sigset_t set; 219 sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
254 return ax; 239 return ax;
255 240
256badframe: 241badframe:
257 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
258 return 0; 243 return 0;
259} 244}
260 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
261/* 253/*
262 * Set up a signal frame. 254 * Set up a signal frame.
263 */ 255 */
264static int 256static int
265setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, 257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
266 struct pt_regs *regs, unsigned long mask) 258 struct pt_regs *regs, unsigned long mask)
267{ 259{
268 int tmp, err = 0; 260 int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
289 err |= __put_user(regs->sp, &sc->sp_at_signal); 281 err |= __put_user(regs->sp, &sc->sp_at_signal);
290 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
291 283
292 tmp = save_i387(fpstate); 284 tmp = save_i387_xstate(fpstate);
293 if (tmp < 0) 285 if (tmp < 0)
294 err = 1; 286 err = 1;
295 else 287 else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
306 * Determine which stack to use.. 298 * Determine which stack to use..
307 */ 299 */
308static inline void __user * 300static inline void __user *
309get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) 301get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
302 void **fpstate)
310{ 303{
311 unsigned long sp; 304 unsigned long sp;
312 305
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
332 sp = (unsigned long) ka->sa.sa_restorer; 325 sp = (unsigned long) ka->sa.sa_restorer;
333 } 326 }
334 327
328 if (used_math()) {
329 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp;
331 }
332
335 sp -= frame_size; 333 sp -= frame_size;
336 /* 334 /*
337 * Align the stack pointer according to the i386 ABI, 335 * Align the stack pointer according to the i386 ABI,
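Editor's note: in the new get_sigframe(), when the task has used the FPU the xsave/fpstate buffer is carved out first (sp -= sig_xstate_size), the frame itself is placed below it, and only then is the pointer aligned for the i386 ABI. A sketch of that address arithmetic; the sizes are made up and sig_xstate_size is a kernel variable, not a constant:

#include <stdio.h>

int main(void)
{
	unsigned long sp = 0xbfff1000;		/* hypothetical user stack pointer */
	unsigned long sig_xstate_size = 832;	/* illustrative, not the real value */
	unsigned long frame_size = 0x2e0;	/* illustrative sizeof(struct sigframe) */
	unsigned long fpstate = 0;
	int used_math = 1;

	if (used_math) {
		sp -= sig_xstate_size;		/* fp/xsave area sits above the frame */
		fpstate = sp;
	}
	sp -= frame_size;
	/* roughly the i386 ABI alignment used by the kernel at this point */
	sp = ((sp + 4) & -16UL) - 4;

	printf("frame at %#lx, fpstate at %#lx\n", sp, fpstate);
	return 0;
}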
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
343} 341}
344 342
345static int 343static int
346setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
347 struct pt_regs *regs) 345 struct pt_regs *regs)
348{ 346{
349 struct sigframe __user *frame; 347 struct sigframe __user *frame;
350 void __user *restorer; 348 void __user *restorer;
351 int err = 0; 349 int err = 0;
352 int usig; 350 void __user *fpstate = NULL;
353 351
354 frame = get_sigframe(ka, regs, sizeof(*frame)); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
355 353
356 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
357 goto give_sigsegv; 355 return -EFAULT;
358 356
359 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
360 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
361 && sig < 32
362 ? current_thread_info()->exec_domain->signal_invmap[sig]
363 : sig;
364 359
365 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
366 if (err) 361 return -EFAULT;
367 goto give_sigsegv;
368
369 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
370 if (err)
371 goto give_sigsegv;
372 362
373 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
374 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
375 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
376 if (err) 366 return -EFAULT;
377 goto give_sigsegv;
378 } 367 }
379 368
380 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
399 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
400 389
401 if (err) 390 if (err)
402 goto give_sigsegv; 391 return -EFAULT;
403 392
404 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
405 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
414 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
415 404
416 return 0; 405 return 0;
417
418give_sigsegv:
419 force_sigsegv(sig, current);
420 return -EFAULT;
421} 406}
422 407
423static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
424 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
425{ 410{
426 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
427 void __user *restorer; 412 void __user *restorer;
428 int err = 0; 413 int err = 0;
429 int usig; 414 void __user *fpstate = NULL;
430 415
431 frame = get_sigframe(ka, regs, sizeof(*frame)); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
432 417
433 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
434 goto give_sigsegv; 419 return -EFAULT;
435
436 usig = current_thread_info()->exec_domain
437 && current_thread_info()->exec_domain->signal_invmap
438 && sig < 32
439 ? current_thread_info()->exec_domain->signal_invmap[sig]
440 : sig;
441 420
442 err |= __put_user(usig, &frame->sig); 421 err |= __put_user(sig, &frame->sig);
443 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
444 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
445 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
446 if (err) 425 if (err)
447 goto give_sigsegv; 426 return -EFAULT;
448 427
449 /* Create the ucontext. */ 428 /* Create the ucontext. */
450 err |= __put_user(0, &frame->uc.uc_flags); 429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
451 err |= __put_user(0, &frame->uc.uc_link); 433 err |= __put_user(0, &frame->uc.uc_link);
452 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
453 err |= __put_user(sas_ss_flags(regs->sp), 435 err |= __put_user(sas_ss_flags(regs->sp),
454 &frame->uc.uc_stack.ss_flags); 436 &frame->uc.uc_stack.ss_flags);
455 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
456 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
457 regs, set->sig[0]); 439 regs, set->sig[0]);
458 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
459 if (err) 441 if (err)
460 goto give_sigsegv; 442 return -EFAULT;
461 443
462 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
463 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
477 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
478 460
479 if (err) 461 if (err)
480 goto give_sigsegv; 462 return -EFAULT;
481 463
482 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
483 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
484 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
485 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
486 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
487 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
488 470
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
492 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
493 475
494 return 0; 476 return 0;
495
496give_sigsegv:
497 force_sigsegv(sig, current);
498 return -EFAULT;
499} 477}
500 478
501/* 479/*
502 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
503 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
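Editor's note: with this hunk setup_rt_frame() becomes the single entry point; on 32-bit, is_ia32 is the constant 1 and ia32_setup_frame/ia32_setup_rt_frame are simply aliases for the native helpers, so the same dispatcher source also works for signal_64.c. A stripped-down model of that dispatch (the handler functions here only print):

#include <stdio.h>

#define SA_SIGINFO 0x4

static int setup_frame_native(int sig)    { printf("frame(%d)\n", sig);    return 0; }
static int setup_rt_frame_native(int sig) { printf("rt_frame(%d)\n", sig); return 0; }

/* on 32-bit these are compile-time aliases; on 64-bit they are the compat paths */
#define is_ia32			1
#define ia32_setup_frame	setup_frame_native
#define ia32_setup_rt_frame	setup_rt_frame_native

static int setup_rt_frame(int sig, unsigned long sa_flags)
{
	int ret;

	if (is_ia32) {
		if (sa_flags & SA_SIGINFO)
			ret = ia32_setup_rt_frame(sig);
		else
			ret = ia32_setup_frame(sig);
	} else {
		ret = setup_rt_frame_native(sig);
	}

	if (ret)
		fprintf(stderr, "forcing SIGSEGV\n");	/* force_sigsegv() in the kernel */
	return ret;
}

int main(void)
{
	setup_rt_frame(11, SA_SIGINFO);
	setup_rt_frame(11, 0);
	return 0;
}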
518
504static int 519static int
505handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
506 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
508 int ret; 523 int ret;
509 524
510 /* Are we from a system call? */ 525 /* Are we from a system call? */
511 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
512 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
513 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
514 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
515 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
516 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
537 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
538 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
539 554
540 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
541 if (ka->sa.sa_flags & SA_SIGINFO)
542 ret = setup_rt_frame(sig, ka, info, oldset, regs);
543 else
544 ret = setup_frame(sig, ka, oldset, regs);
545 556
546 if (ret) 557 if (ret)
547 return ret; 558 return ret;
548 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
549 /* 569 /*
550 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
551 */ 571 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
558 * handler too. 578 * handler too.
559 */ 579 */
560 regs->flags &= ~X86_EFLAGS_TF; 580 regs->flags &= ~X86_EFLAGS_TF;
561 if (test_thread_flag(TIF_SINGLESTEP))
562 ptrace_notify(SIGTRAP);
563 581
564 spin_lock_irq(&current->sighand->siglock); 582 spin_lock_irq(&current->sighand->siglock);
565 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 583 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
568 recalc_sigpending(); 586 recalc_sigpending();
569 spin_unlock_irq(&current->sighand->siglock); 587 spin_unlock_irq(&current->sighand->siglock);
570 588
589 tracehook_signal_handler(sig, info, ka, regs,
590 test_thread_flag(TIF_SINGLESTEP));
591
571 return 0; 592 return 0;
572} 593}
573 594
595#define NR_restart_syscall __NR_restart_syscall
574/* 596/*
575 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
576 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
623 } 645 }
624 646
625 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
626 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
627 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
628 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
629 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
630 case -ERESTARTSYS: 652 case -ERESTARTSYS:
631 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
634 break; 656 break;
635 657
636 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
637 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
638 regs->ip -= 2; 660 regs->ip -= 2;
639 break; 661 break;
640 } 662 }
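Editor's note: the restart logic above keys off the saved syscall number and error value: when no handler runs, -ERESTARTSYS/-ERESTARTNOHAND/-ERESTARTNOINTR put the original number back in ax and back ip up two bytes (the length of the int $0x80 / syscall instruction) so the call is re-issued, while -ERESTART_RESTARTBLOCK is redirected to restart_syscall(2). A small model of that decision; the syscall numbers and addresses are illustrative:

#include <stdio.h>

#define ERESTARTSYS		512
#define ERESTARTNOINTR		513
#define ERESTARTNOHAND		514
#define ERESTART_RESTARTBLOCK	516
#define NR_restart_syscall	0	/* illustrative syscall number */

struct regs { long ax, orig_ax; unsigned long ip; };

/* no handler was invoked: decide whether and how to restart */
static void restart_without_handler(struct regs *r)
{
	switch (-r->ax) {
	case ERESTARTNOHAND:
	case ERESTARTSYS:
	case ERESTARTNOINTR:
		r->ax = r->orig_ax;	/* re-issue the same syscall */
		r->ip -= 2;		/* size of the int $0x80 / syscall insn */
		break;
	case ERESTART_RESTARTBLOCK:
		r->ax = NR_restart_syscall;
		r->ip -= 2;
		break;
	}
}

int main(void)
{
	struct regs r = { .ax = -ERESTARTSYS, .orig_ax = 162, .ip = 0x8048f02 };

	restart_without_handler(&r);
	printf("ax=%ld ip=%#lx\n", r.ax, r.ip);
	return 0;
}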
@@ -657,18 +679,38 @@ static void do_signal(struct pt_regs *regs)
657void 679void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 681{
660 /* Pending single-step? */ 682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
661 if (thread_info_flags & _TIF_SINGLESTEP) { 683 /* notify userspace of pending MCEs */
662 regs->flags |= X86_EFLAGS_TF; 684 if (thread_info_flags & _TIF_MCE_NOTIFY)
663 clear_thread_flag(TIF_SINGLESTEP); 685 mce_notify_user();
664 } 686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
665 687
666 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 690 do_signal(regs);
669 691
670 if (thread_info_flags & _TIF_HRTICK_RESCHED) 692 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
671 hrtick_resched(); 693 clear_thread_flag(TIF_NOTIFY_RESUME);
694 tracehook_notify_resume(regs);
695 }
672 696
697#ifdef CONFIG_X86_32
673 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
674} 716}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
29#include "sigframe.h" 33#include "sigframe.h"
30 34
31#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
41# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
42#endif 46#endif
43 47
44int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
45 sigset_t *set, struct pt_regs * regs);
46int ia32_setup_frame(int sig, struct k_sigaction *ka,
47 sigset_t *set, struct pt_regs * regs);
48
49asmlinkage long 48asmlinkage long
50sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
51 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -53,6 +52,15 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
54} 53}
55 54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
56 64
57/* 65/*
58 * Do a signal return; undo the signal stack. 66 * Do a signal return; undo the signal stack.
@@ -61,13 +69,13 @@ static int
61restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
62 unsigned long *pax) 70 unsigned long *pax)
63{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
64 unsigned int err = 0; 74 unsigned int err = 0;
65 75
66 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
67 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
68 78
69#define COPY(x) err |= __get_user(regs->x, &sc->x)
70
71 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
72 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
73 COPY(r8); 81 COPY(r8);
@@ -82,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
82 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
83 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
84 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
85 { 93 COPY_SEG_STRICT(cs);
86 unsigned cs;
87 err |= __get_user(cs, &sc->cs);
88 regs->cs = cs | 3; /* Force into user mode */
89 }
90 94
91 { 95 err |= __get_user(tmpflags, &sc->flags);
92 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
93 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
94 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
95 regs->orig_ax = -1; /* disable syscall checks */
96 }
97 98
98 { 99 err |= __get_user(buf, &sc->fpstate);
99 struct _fpstate __user * buf; 100 err |= restore_i387_xstate(buf);
100 err |= __get_user(buf, &sc->fpstate);
101
102 if (buf) {
103 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
104 goto badframe;
105 err |= restore_i387(buf);
106 } else {
107 struct task_struct *me = current;
108 if (used_math()) {
109 clear_fpu(me);
110 clear_used_math();
111 }
112 }
113 }
114 101
115 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
116 return err; 103 return err;
117
118badframe:
119 return 1;
120} 104}
121 105
122asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
123{ 107{
124 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
125 sigset_t set;
126 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
127 111
128 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
129 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -136,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
136 current->blocked = set; 120 current->blocked = set;
137 recalc_sigpending(); 121 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
139 123
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
141 goto badframe; 125 goto badframe;
142 126
@@ -146,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
146 return ax; 130 return ax;
147 131
148badframe: 132badframe:
149 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
150 return 0; 134 return 0;
151} 135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
152 141
153/* 142/*
154 * Set up a signal frame. 143 * Set up a signal frame.
155 */ 144 */
156 145
157static inline int 146static inline int
158setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
159{ 149{
160 int err = 0; 150 int err = 0;
161 151
@@ -207,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
207 sp = current->sas_ss_sp + current->sas_ss_size; 197 sp = current->sas_ss_sp + current->sas_ss_size;
208 } 198 }
209 199
210 return (void __user *)round_down(sp - size, 16); 200 return (void __user *)round_down(sp - size, 64);
211} 201}
212 202
213static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
214 sigset_t *set, struct pt_regs * regs) 204 sigset_t *set, struct pt_regs *regs)
215{ 205{
216 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
217 struct _fpstate __user *fp = NULL; 207 void __user *fp = NULL;
218 int err = 0; 208 int err = 0;
219 struct task_struct *me = current; 209 struct task_struct *me = current;
220 210
221 if (used_math()) { 211 if (used_math()) {
222 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 212 fp = get_stack(ka, regs, sig_xstate_size);
223 frame = (void __user *)round_down( 213 frame = (void __user *)round_down(
224 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
225 215
226 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 216 if (save_i387_xstate(fp) < 0)
227 goto give_sigsegv; 217 return -EFAULT;
228
229 if (save_i387(fp) < 0)
230 err |= -1;
231 } else 218 } else
232 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
233 220
234 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
235 goto give_sigsegv; 222 return -EFAULT;
236 223
237 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
238 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
239 if (err) 226 return -EFAULT;
240 goto give_sigsegv;
241 } 227 }
242 228
243 /* Create the ucontext. */ 229 /* Create the ucontext. */
244 err |= __put_user(0, &frame->uc.uc_flags); 230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
245 err |= __put_user(0, &frame->uc.uc_link); 234 err |= __put_user(0, &frame->uc.uc_link);
246 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
247 err |= __put_user(sas_ss_flags(regs->sp), 236 err |= __put_user(sas_ss_flags(regs->sp),
@@ -249,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
249 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
250 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
251 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
252 if (sizeof(*set) == 16) { 241 if (sizeof(*set) == 16) {
253 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
254 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
255 } else 244 } else
256 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
257 246
@@ -262,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
262 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
263 } else { 252 } else {
264 /* could use a vstub here */ 253 /* could use a vstub here */
265 goto give_sigsegv; 254 return -EFAULT;
266 } 255 }
267 256
268 if (err) 257 if (err)
269 goto give_sigsegv; 258 return -EFAULT;
270 259
271 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
272 regs->di = sig; 261 regs->di = sig;
273 /* In case the signal handler was declared without prototypes */ 262 /* In case the signal handler was declared without prototypes */
274 regs->ax = 0; 263 regs->ax = 0;
275 264
276 /* This also works for non SA_SIGINFO handlers because they expect the 265 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -286,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
286 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
287 276
288 return 0; 277 return 0;
289
290give_sigsegv:
291 force_sigsegv(sig, current);
292 return -EFAULT;
293} 278}
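Editor's note: get_stack() now rounds the fpstate pointer down to 64 bytes instead of 16 because the xsave area must be 64-byte aligned; the frame itself is still placed below it. A one-function sketch of round_down() and the resulting addresses, with made-up sizes:

#include <stdio.h>

#define round_down(x, y)	((x) & ~((unsigned long)(y) - 1))

int main(void)
{
	unsigned long sp = 0x7fffffffe123;
	unsigned long xstate_size = 832;	/* illustrative */

	unsigned long fp = round_down(sp - xstate_size, 64);	/* xsave wants 64B */
	unsigned long frame = round_down(fp - 0x1c8, 16) - 8;	/* 0x1c8: made-up frame size */

	printf("fp=%#lx frame=%#lx\n", fp, frame);
	return 0;
}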
294 279
295/* 280/*
296 * Return -1L or the syscall number that @regs is executing. 281 * OK, we're invoking a handler
297 */ 282 */
298static long current_syscall(struct pt_regs *regs) 283static int signr_convert(int sig)
299{ 284{
300 /* 285 return sig;
301 * We always sign-extend a -1 value being set here,
302 * so this is always either -1L or a syscall number.
303 */
304 return regs->orig_ax;
305} 286}
306 287
307/*
308 * Return a value that is -EFOO if the system call in @regs->orig_ax
309 * returned an error. This only works for @regs from @current.
310 */
311static long current_syscall_ret(struct pt_regs *regs)
312{
313#ifdef CONFIG_IA32_EMULATION 288#ifdef CONFIG_IA32_EMULATION
314 if (test_thread_flag(TIF_IA32)) 289#define is_ia32 test_thread_flag(TIF_IA32)
315 /* 290#else
316 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO 291#define is_ia32 0
317 * and will match correctly in comparisons.
318 */
319 return (int) regs->ax;
320#endif 292#endif
321 return regs->ax;
322}
323 293
324/* 294static int
325 * OK, we're invoking a handler 295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
326 */ 296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
327 317
328static int 318static int
329handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -332,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
332 int ret; 322 int ret;
333 323
334 /* Are we from a system call? */ 324 /* Are we from a system call? */
335 if (current_syscall(regs) >= 0) { 325 if (syscall_get_nr(current, regs) >= 0) {
336 /* If so, check system call restarting.. */ 326 /* If so, check system call restarting.. */
337 switch (current_syscall_ret(regs)) { 327 switch (syscall_get_error(current, regs)) {
338 case -ERESTART_RESTARTBLOCK: 328 case -ERESTART_RESTARTBLOCK:
339 case -ERESTARTNOHAND: 329 case -ERESTARTNOHAND:
340 regs->ax = -EINTR; 330 regs->ax = -EINTR;
@@ -361,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
361 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
362 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
363 353
364#ifdef CONFIG_IA32_EMULATION
365 if (test_thread_flag(TIF_IA32)) {
366 if (ka->sa.sa_flags & SA_SIGINFO)
367 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
368 else
369 ret = ia32_setup_frame(sig, ka, oldset, regs);
370 } else
371#endif
372 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
373 355
374 if (ret == 0) { 356 if (ret)
375 /* 357 return ret;
376 * This has nothing to do with segment registers,
377 * despite the name. This magic affects uaccess.h
378 * macros' behavior. Reset it to the normal setting.
379 */
380 set_fs(USER_DS);
381 358
382 /* 359#ifdef CONFIG_X86_64
383 * Clear the direction flag as per the ABI for function entry. 360 /*
384 */ 361 * This has nothing to do with segment registers,
385 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
386 367
387 /* 368 /*
388 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
389 * notify any tracer that was single-stepping it. 370 */
390 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
391 * handler too.
392 */
393 regs->flags &= ~X86_EFLAGS_TF;
394 if (test_thread_flag(TIF_SINGLESTEP))
395 ptrace_notify(SIGTRAP);
396
397 spin_lock_irq(&current->sighand->siglock);
398 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
399 if (!(ka->sa.sa_flags & SA_NODEFER))
400 sigaddset(&current->blocked,sig);
401 recalc_sigpending();
402 spin_unlock_irq(&current->sighand->siglock);
403 }
404 372
405 return ret; 373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
406} 392}
407 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
408/* 396/*
409 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
410 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -434,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
434 422
435 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
436 if (signr > 0) { 424 if (signr > 0) {
437 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
438 * signal to user space. The processor register will 427 * signal to user space. The processor register will
439 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
440 * inside the kernel. 429 * inside the kernel.
@@ -442,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
442 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
443 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
444 433
445 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
446 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
447 /* 436 /*
448 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -456,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
456 } 445 }
457 446
458 /* Did we come from a system call? */ 447 /* Did we come from a system call? */
459 if (current_syscall(regs) >= 0) { 448 if (syscall_get_nr(current, regs) >= 0) {
460 /* Restart the system call - no handlers present */ 449 /* Restart the system call - no handlers present */
461 switch (current_syscall_ret(regs)) { 450 switch (syscall_get_error(current, regs)) {
462 case -ERESTARTNOHAND: 451 case -ERESTARTNOHAND:
463 case -ERESTARTSYS: 452 case -ERESTARTSYS:
464 case -ERESTARTNOINTR: 453 case -ERESTARTNOINTR:
465 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
466 regs->ip -= 2; 455 regs->ip -= 2;
467 break; 456 break;
457
468 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
469 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
470 __NR_ia32_restart_syscall :
471 __NR_restart_syscall;
472 regs->ip -= 2; 460 regs->ip -= 2;
473 break; 461 break;
474 } 462 }
@@ -484,38 +472,45 @@ static void do_signal(struct pt_regs *regs)
484 } 472 }
485} 473}
486 474
487void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
488 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
489{ 481{
490 /* Pending single-step? */ 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
499 mce_notify_user(); 485 mce_notify_user();
500#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
501 487
502 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 490 do_signal(regs);
505 491
506 if (thread_info_flags & _TIF_HRTICK_RESCHED) 492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
507 hrtick_resched(); 493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
508} 500}
509 501
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
511{ 503{
512 struct task_struct *me = current; 504 struct task_struct *me = current;
505
513 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
514 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
515 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
516 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
517 printk("\n"); 512 printk(KERN_CONT "\n");
518 } 513 }
519 514
520 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
521} 516}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 0cb7aadc87cd..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu)
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
122} 122}
123 123
124/* 124void native_send_call_func_single_ipi(int cpu)
125 * Structure and data for smp_call_function(). This is designed to minimise
126 * static memory requirements. It also looks cleaner.
127 */
128static DEFINE_SPINLOCK(call_lock);
129
130struct call_data_struct {
131 void (*func) (void *info);
132 void *info;
133 atomic_t started;
134 atomic_t finished;
135 int wait;
136};
137
138void lock_ipi_call_lock(void)
139{
140 spin_lock_irq(&call_lock);
141}
142
143void unlock_ipi_call_lock(void)
144{ 125{
145 spin_unlock_irq(&call_lock); 126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
146}
147
148static struct call_data_struct *call_data;
149
150static void __smp_call_function(void (*func) (void *info), void *info,
151 int nonatomic, int wait)
152{
153 struct call_data_struct data;
154 int cpus = num_online_cpus() - 1;
155
156 if (!cpus)
157 return;
158
159 data.func = func;
160 data.info = info;
161 atomic_set(&data.started, 0);
162 data.wait = wait;
163 if (wait)
164 atomic_set(&data.finished, 0);
165
166 call_data = &data;
167 mb();
168
169 /* Send a message to all other CPUs and wait for them to respond */
170 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
171
172 /* Wait for response */
173 while (atomic_read(&data.started) != cpus)
174 cpu_relax();
175
176 if (wait)
177 while (atomic_read(&data.finished) != cpus)
178 cpu_relax();
179} 127}
180 128
181 129void native_send_call_func_ipi(cpumask_t mask)
182/**
183 * smp_call_function_mask(): Run a function on a set of other CPUs.
184 * @mask: The set of cpus to run on. Must not include the current cpu.
185 * @func: The function to run. This must be fast and non-blocking.
186 * @info: An arbitrary pointer to pass to the function.
187 * @wait: If true, wait (atomically) until function has completed on other CPUs.
188 *
189 * Returns 0 on success, else a negative status code.
190 *
191 * If @wait is true, then returns once @func has returned; otherwise
192 * it returns just before the target cpu calls @func.
193 *
194 * You must not call this function with disabled interrupts or from a
195 * hardware interrupt handler or from a bottom half handler.
196 */
197static int
198native_smp_call_function_mask(cpumask_t mask,
199 void (*func)(void *), void *info,
200 int wait)
201{ 130{
202 struct call_data_struct data;
203 cpumask_t allbutself; 131 cpumask_t allbutself;
204 int cpus;
205
206 /* Can deadlock when called with interrupts disabled */
207 WARN_ON(irqs_disabled());
208
209 /* Holding any lock stops cpus from going down. */
210 spin_lock(&call_lock);
211 132
212 allbutself = cpu_online_map; 133 allbutself = cpu_online_map;
213 cpu_clear(smp_processor_id(), allbutself); 134 cpu_clear(smp_processor_id(), allbutself);
214 135
215 cpus_and(mask, mask, allbutself);
216 cpus = cpus_weight(mask);
217
218 if (!cpus) {
219 spin_unlock(&call_lock);
220 return 0;
221 }
222
223 data.func = func;
224 data.info = info;
225 atomic_set(&data.started, 0);
226 data.wait = wait;
227 if (wait)
228 atomic_set(&data.finished, 0);
229
230 call_data = &data;
231 wmb();
232
233 /* Send a message to other CPUs */
234 if (cpus_equal(mask, allbutself) && 136 if (cpus_equal(mask, allbutself) &&
235 cpus_equal(cpu_online_map, cpu_callout_map)) 137 cpus_equal(cpu_online_map, cpu_callout_map))
236 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 138 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
237 else 139 else
238 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
239
240 /* Wait for response */
241 while (atomic_read(&data.started) != cpus)
242 cpu_relax();
243
244 if (wait)
245 while (atomic_read(&data.finished) != cpus)
246 cpu_relax();
247 spin_unlock(&call_lock);
248
249 return 0;
250} 141}
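Editor's note: the code removed in this hunk was the old hand-rolled cross-CPU call: the initiator fills a call_data_struct, sends the IPI, and spins until every target bumps "started" (and "finished" when wait is set), while targets run the function from their interrupt handler; the generic kernel/smp.c helpers now provide this. A single-process userspace model of the bookkeeping (C11 atomics for flavour; there are no real IPIs or concurrent CPUs here, the "interrupts" are just called directly):

#include <stdio.h>
#include <stdatomic.h>

struct call_data_struct {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data_struct *call_data;

/* what each target CPU's CALL_FUNCTION_VECTOR handler used to do */
static void call_function_interrupt(void)
{
	void (*func)(void *) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	atomic_fetch_add(&call_data->started, 1);	/* "I have copied the data" */
	func(info);
	if (wait)
		atomic_fetch_add(&call_data->finished, 1);
}

static void say_hello(void *info)
{
	printf("hello from a target cpu (%s)\n", (char *)info);
}

int main(void)
{
	struct call_data_struct data = {
		.func = say_hello, .info = "demo", .wait = 1,
	};
	int cpus = 2;					/* pretend two other CPUs */

	call_data = &data;
	/* send_IPI_allbutself() would go here; we "deliver" it by hand instead */
	for (int cpu = 0; cpu < cpus; cpu++)
		call_function_interrupt();

	/* initiator spins until everyone has at least picked up the data */
	while (atomic_load(&data.started) != cpus)
		;
	if (data.wait)
		while (atomic_load(&data.finished) != cpus)
			;
	printf("all %d cpus done\n", cpus);
	return 0;
}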
251 142
252static void stop_this_cpu(void *dummy) 143static void stop_this_cpu(void *dummy)
@@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy)
268 159
269static void native_smp_send_stop(void) 160static void native_smp_send_stop(void)
270{ 161{
271 int nolock;
272 unsigned long flags; 162 unsigned long flags;
273 163
274 if (reboot_force) 164 if (reboot_force)
275 return; 165 return;
276 166
277 /* Don't deadlock on the call lock in panic */ 167 smp_call_function(stop_this_cpu, NULL, 0);
278 nolock = !spin_trylock(&call_lock);
279 local_irq_save(flags); 168 local_irq_save(flags);
280 __smp_call_function(stop_this_cpu, NULL, 0, 0);
281 if (!nolock)
282 spin_unlock(&call_lock);
283 disable_local_APIC(); 169 disable_local_APIC();
284 local_irq_restore(flags); 170 local_irq_restore(flags);
285} 171}
@@ -301,44 +187,44 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
301 187
302void smp_call_function_interrupt(struct pt_regs *regs) 188void smp_call_function_interrupt(struct pt_regs *regs)
303{ 189{
304 void (*func) (void *info) = call_data->func;
305 void *info = call_data->info;
306 int wait = call_data->wait;
307
308 ack_APIC_irq(); 190 ack_APIC_irq();
309 /*
310 * Notify initiating CPU that I've grabbed the data and am
311 * about to execute the function
312 */
313 mb();
314 atomic_inc(&call_data->started);
315 /*
316 * At this point the info structure may be out of scope unless wait==1
317 */
318 irq_enter(); 191 irq_enter();
319 (*func)(info); 192 generic_smp_call_function_interrupt();
320#ifdef CONFIG_X86_32 193#ifdef CONFIG_X86_32
321 __get_cpu_var(irq_stat).irq_call_count++; 194 __get_cpu_var(irq_stat).irq_call_count++;
322#else 195#else
323 add_pda(irq_call_count, 1); 196 add_pda(irq_call_count, 1);
324#endif 197#endif
325 irq_exit(); 198 irq_exit();
199}
326 200
327 if (wait) { 201void smp_call_function_single_interrupt(struct pt_regs *regs)
328 mb(); 202{
329 atomic_inc(&call_data->finished); 203 ack_APIC_irq();
330 } 204 irq_enter();
205 generic_smp_call_function_single_interrupt();
206#ifdef CONFIG_X86_32
207 __get_cpu_var(irq_stat).irq_call_count++;
208#else
209 add_pda(irq_call_count, 1);
210#endif
211 irq_exit();
331} 212}
332 213
333struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
334 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
335 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
336 .cpu_up = native_cpu_up,
337 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
338 218
339 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
340 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
341 .smp_call_function_mask = native_smp_call_function_mask, 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
227 .send_call_func_ipi = native_send_call_func_ipi,
228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
342}; 229};
343EXPORT_SYMBOL_GPL(smp_ops); 230EXPORT_SYMBOL_GPL(smp_ops);
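Editor's note: smp_ops is a plain table of function pointers, so a paravirt backend can override the native IPI senders and CPU-hotplug hooks while callers stay unchanged; this hunk adds the hotplug and generic-smp entries. A minimal model of that ops-table pattern (struct name and members are illustrative, not the kernel's full definition):

#include <stdio.h>

struct smp_ops_model {
	void (*send_reschedule)(int cpu);
	void (*send_call_func_single_ipi)(int cpu);
};

static void native_send_reschedule(int cpu)
{
	printf("native: reschedule IPI to cpu %d\n", cpu);
}

static void native_send_call_func_single_ipi(int cpu)
{
	printf("native: call-function-single IPI to cpu %d\n", cpu);
}

/* default table; a hypervisor backend would overwrite these pointers at boot */
static struct smp_ops_model smp_ops = {
	.send_reschedule		= native_send_reschedule,
	.send_call_func_single_ipi	= native_send_call_func_single_ipi,
};

int main(void)
{
	smp_ops.send_reschedule(1);
	smp_ops.send_call_func_single_ipi(3);
	return 0;
}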
344
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 56078d61c793..8c3aca7cb343 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -59,7 +60,6 @@
59#include <asm/pgtable.h> 60#include <asm/pgtable.h>
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61#include <asm/mtrr.h> 62#include <asm/mtrr.h>
62#include <asm/nmi.h>
63#include <asm/vmi.h> 63#include <asm/vmi.h>
64#include <asm/genapic.h> 64#include <asm/genapic.h>
65#include <linux/mc146818rtc.h> 65#include <linux/mc146818rtc.h>
@@ -68,22 +68,6 @@
68#include <mach_wakecpu.h> 68#include <mach_wakecpu.h>
69#include <smpboot_hooks.h> 69#include <smpboot_hooks.h>
70 70
71/*
72 * FIXME: For x86_64, those are defined in other files. But moving them here,
73 * would make the setup areas dependent on smp, which is a loss. When we
74 * integrate apic between arches, we can probably do a better job, but
75 * right now, they'll stay here -- glommer
76 */
77
78/* which logical CPU number maps to which CPU (physical APIC ID) */
79u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
80 { [0 ... NR_CPUS-1] = BAD_APICID };
81void *x86_cpu_to_apicid_early_ptr;
82
83u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
84 = { [0 ... NR_CPUS-1] = BAD_APICID };
85void *x86_bios_cpu_apicid_early_ptr;
86
87#ifdef CONFIG_X86_32 71#ifdef CONFIG_X86_32
88u8 apicid_2_node[MAX_APICID]; 72u8 apicid_2_node[MAX_APICID];
89static int low_mappings; 73static int low_mappings;
@@ -105,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
105#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 89#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
106#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 90#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
107#else 91#else
108struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 92static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
109#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 93#define get_idle_for_cpu(x) (idle_thread_array[(x)])
110#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) 94#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
111#endif 95#endif
@@ -140,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
140 124
141static atomic_t init_deasserted; 125static atomic_t init_deasserted;
142 126
143static int boot_cpu_logical_apicid;
144 127
145/* representing cpus for which sibling maps can be computed */ 128/* representing cpus for which sibling maps can be computed */
146static cpumask_t cpu_sibling_setup_map; 129static cpumask_t cpu_sibling_setup_map;
147 130
148/* Set if we find a B stepping CPU */ 131/* Set if we find a B stepping CPU */
149int __cpuinitdata smp_b_stepping; 132static int __cpuinitdata smp_b_stepping;
150 133
151#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
152 135
@@ -182,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
182#endif 165#endif
183 166
184#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
185u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = 170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
186 { [0 ... NR_CPUS-1] = BAD_APICID }; 171 { [0 ... NR_CPUS-1] = BAD_APICID };
187 172
@@ -198,13 +183,12 @@ static void map_cpu_to_logical_apicid(void)
198 map_cpu_to_node(cpu, node); 183 map_cpu_to_node(cpu, node);
199} 184}
200 185
201static void unmap_cpu_to_logical_apicid(int cpu) 186void numa_remove_cpu(int cpu)
202{ 187{
203 cpu_2_logical_apicid[cpu] = BAD_APICID; 188 cpu_2_logical_apicid[cpu] = BAD_APICID;
204 unmap_cpu_to_node(cpu); 189 unmap_cpu_to_node(cpu);
205} 190}
206#else 191#else
207#define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
208#define map_cpu_to_logical_apicid() do {} while (0) 192#define map_cpu_to_logical_apicid() do {} while (0)
209#endif 193#endif
210 194
@@ -228,13 +212,13 @@ static void __cpuinit smp_callin(void)
228 /* 212 /*
229 * (This works even if the APIC is not enabled.) 213 * (This works even if the APIC is not enabled.)
230 */ 214 */
231 phys_id = GET_APIC_ID(read_apic_id()); 215 phys_id = read_apic_id();
232 cpuid = smp_processor_id(); 216 cpuid = smp_processor_id();
233 if (cpu_isset(cpuid, cpu_callin_map)) { 217 if (cpu_isset(cpuid, cpu_callin_map)) {
234 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
235 phys_id, cpuid); 219 phys_id, cpuid);
236 } 220 }
237 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 221 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
238 222
239 /* 223 /*
240 * STARTUP IPIs are fragile beasts as they might sometimes 224 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -269,12 +253,13 @@ static void __cpuinit smp_callin(void)
269 * boards) 253 * boards)
270 */ 254 */
271 255
272 Dprintk("CALLIN, before setup_local_APIC().\n"); 256 pr_debug("CALLIN, before setup_local_APIC().\n");
273 smp_callin_clear_local_apic(); 257 smp_callin_clear_local_apic();
274 setup_local_APIC(); 258 setup_local_APIC();
275 end_local_APIC_setup(); 259 end_local_APIC_setup();
276 map_cpu_to_logical_apicid(); 260 map_cpu_to_logical_apicid();
277 261
262 notify_cpu_starting(cpuid);
278 /* 263 /*
279 * Get our bogomips. 264 * Get our bogomips.
280 * 265 *
@@ -284,7 +269,7 @@ static void __cpuinit smp_callin(void)
284 local_irq_enable(); 269 local_irq_enable();
285 calibrate_delay(); 270 calibrate_delay();
286 local_irq_disable(); 271 local_irq_disable();
287 Dprintk("Stack at about %p\n", &cpuid); 272 pr_debug("Stack at about %p\n", &cpuid);
288 273
289 /* 274 /*
290 * Save our processor parameters 275 * Save our processor parameters
@@ -344,53 +329,30 @@ static void __cpuinit start_secondary(void *unused)
344 * for which cpus receive the IPI. Holding this 329 * for which cpus receive the IPI. Holding this
345 * lock helps us to not include this cpu in a currently in progress 330 * lock helps us to not include this cpu in a currently in progress
346 * smp_call_function(). 331 * smp_call_function().
332 *
333 * We need to hold vector_lock so there the set of online cpus
334 * does not change while we are assigning vectors to cpus. Holding
335 * this lock ensures we don't half assign or remove an irq from a cpu.
347 */ 336 */
348 lock_ipi_call_lock(); 337 ipi_call_lock();
349#ifdef CONFIG_X86_64 338 lock_vector_lock();
350 spin_lock(&vector_lock);
351
352 /* Setup the per cpu irq handling data structures */
353 __setup_vector_irq(smp_processor_id()); 339 __setup_vector_irq(smp_processor_id());
354 /*
355 * Allow the master to continue.
356 */
357 spin_unlock(&vector_lock);
358#endif
359 cpu_set(smp_processor_id(), cpu_online_map); 340 cpu_set(smp_processor_id(), cpu_online_map);
360 unlock_ipi_call_lock(); 341 unlock_vector_lock();
342 ipi_call_unlock();
361 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 343 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
362 344
345 /* enable local interrupts */
346 local_irq_enable();
347
363 setup_secondary_clock(); 348 setup_secondary_clock();
364 349
365 wmb(); 350 wmb();
366 cpu_idle(); 351 cpu_idle();
367} 352}
368 353
369#ifdef CONFIG_X86_32
370/*
371 * Everything has been set up for the secondary
372 * CPUs - they just need to reload everything
373 * from the task structure
374 * This function must not return.
375 */
376void __devinit initialize_secondary(void)
377{
378 /*
379 * We don't actually need to load the full TSS,
380 * basically just the stack pointer and the ip.
381 */
382
383 asm volatile(
384 "movl %0,%%esp\n\t"
385 "jmp *%1"
386 :
387 :"m" (current->thread.sp), "m" (current->thread.ip));
388}
389#endif
390
391static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) 354static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
392{ 355{
393#ifdef CONFIG_X86_32
394 /* 356 /*
395 * Mask B, Pentium, but not Pentium MMX 357 * Mask B, Pentium, but not Pentium MMX
396 */ 358 */
@@ -440,7 +402,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
440 402
441valid_k7: 403valid_k7:
442 ; 404 ;
443#endif
444} 405}
445 406
446static void __cpuinit smp_checks(void) 407static void __cpuinit smp_checks(void)
@@ -487,7 +448,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
487 cpu_set(cpu, cpu_sibling_setup_map); 448 cpu_set(cpu, cpu_sibling_setup_map);
488 449
489 if (smp_num_siblings > 1) { 450 if (smp_num_siblings > 1) {
490 for_each_cpu_mask(i, cpu_sibling_setup_map) { 451 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
491 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 452 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
492 c->cpu_core_id == cpu_data(i).cpu_core_id) { 453 c->cpu_core_id == cpu_data(i).cpu_core_id) {
493 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 454 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -510,7 +471,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
510 return; 471 return;
511 } 472 }
512 473
513 for_each_cpu_mask(i, cpu_sibling_setup_map) { 474 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
514 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 475 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
515 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 476 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
516 cpu_set(i, c->llc_shared_map); 477 cpu_set(i, c->llc_shared_map);
@@ -555,23 +516,6 @@ cpumask_t cpu_coregroup_map(int cpu)
555 return c->llc_shared_map; 516 return c->llc_shared_map;
556} 517}
557 518
558#ifdef CONFIG_X86_32
559/*
560 * We are called very early to get the low memory for the
561 * SMP bootup trampoline page.
562 */
563void __init smp_alloc_memory(void)
564{
565 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
566 /*
567 * Has to be in very low memory so we can execute
568 * real-mode AP code.
569 */
570 if (__pa(trampoline_base) >= 0x9F000)
571 BUG();
572}
573#endif
574
575static void impress_friends(void) 519static void impress_friends(void)
576{ 520{
577 int cpu; 521 int cpu;
@@ -579,7 +523,7 @@ static void impress_friends(void)
579 /* 523 /*
580 * Allow the user to impress friends. 524 * Allow the user to impress friends.
581 */ 525 */
582 Dprintk("Before bogomips.\n"); 526 pr_debug("Before bogomips.\n");
583 for_each_possible_cpu(cpu) 527 for_each_possible_cpu(cpu)
584 if (cpu_isset(cpu, cpu_callout_map)) 528 if (cpu_isset(cpu, cpu_callout_map))
585 bogosum += cpu_data(cpu).loops_per_jiffy; 529 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -589,7 +533,7 @@ static void impress_friends(void)
589 bogosum/(500000/HZ), 533 bogosum/(500000/HZ),
590 (bogosum/(5000/HZ))%100); 534 (bogosum/(5000/HZ))%100);
591 535
592 Dprintk("Before bogocount - setting activated=1.\n"); 536 pr_debug("Before bogocount - setting activated=1.\n");
593} 537}
594 538
595static inline void __inquire_remote_apic(int apicid) 539static inline void __inquire_remote_apic(int apicid)
@@ -612,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
612 printk(KERN_CONT 556 printk(KERN_CONT
613 "a previous APIC delivery may have failed\n"); 557 "a previous APIC delivery may have failed\n");
614 558
615 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 559 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
616 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
617 560
618 timeout = 0; 561 timeout = 0;
619 do { 562 do {
@@ -645,29 +588,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
645 int maxlvt; 588 int maxlvt;
646 589
647 /* Target chip */ 590 /* Target chip */
648 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
649
650 /* Boot on the stack */ 591 /* Boot on the stack */
651 /* Kick the second */ 592 /* Kick the second */
652 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 593 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
653 594
654 Dprintk("Waiting for send to finish...\n"); 595 pr_debug("Waiting for send to finish...\n");
655 send_status = safe_apic_wait_icr_idle(); 596 send_status = safe_apic_wait_icr_idle();
656 597
657 /* 598 /*
658 * Give the other CPU some time to accept the IPI. 599 * Give the other CPU some time to accept the IPI.
659 */ 600 */
660 udelay(200); 601 udelay(200);
661 /* 602 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
662 * Due to the Pentium erratum 3AP. 603 maxlvt = lapic_get_maxlvt();
663 */ 604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
664 maxlvt = lapic_get_maxlvt(); 605 apic_write(APIC_ESR, 0);
665 if (maxlvt > 3) { 606 accept_status = (apic_read(APIC_ESR) & 0xEF);
666 apic_read_around(APIC_SPIV);
667 apic_write(APIC_ESR, 0);
668 } 607 }
669 accept_status = (apic_read(APIC_ESR) & 0xEF); 608 pr_debug("NMI sent.\n");
670 Dprintk("NMI sent.\n");
671 609
672 if (send_status) 610 if (send_status)
673 printk(KERN_ERR "APIC never delivered???\n"); 611 printk(KERN_ERR "APIC never delivered???\n");
@@ -691,42 +629,40 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
691 return send_status; 629 return send_status;
692 } 630 }
693 631
632 maxlvt = lapic_get_maxlvt();
633
694 /* 634 /*
695 * Be paranoid about clearing APIC errors. 635 * Be paranoid about clearing APIC errors.
696 */ 636 */
697 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 637 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
698 apic_read_around(APIC_SPIV); 638 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
699 apic_write(APIC_ESR, 0); 639 apic_write(APIC_ESR, 0);
700 apic_read(APIC_ESR); 640 apic_read(APIC_ESR);
701 } 641 }
702 642
703 Dprintk("Asserting INIT.\n"); 643 pr_debug("Asserting INIT.\n");
704 644
705 /* 645 /*
706 * Turn INIT on target chip 646 * Turn INIT on target chip
707 */ 647 */
708 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
709
710 /* 648 /*
711 * Send IPI 649 * Send IPI
712 */ 650 */
713 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 651 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
714 | APIC_DM_INIT); 652 phys_apicid);
715 653
716 Dprintk("Waiting for send to finish...\n"); 654 pr_debug("Waiting for send to finish...\n");
717 send_status = safe_apic_wait_icr_idle(); 655 send_status = safe_apic_wait_icr_idle();
718 656
719 mdelay(10); 657 mdelay(10);
720 658
721 Dprintk("Deasserting INIT.\n"); 659 pr_debug("Deasserting INIT.\n");
722 660
723 /* Target chip */ 661 /* Target chip */
724 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
725
726 /* Send IPI */ 662 /* Send IPI */
727 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 663 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
728 664
729 Dprintk("Waiting for send to finish...\n"); 665 pr_debug("Waiting for send to finish...\n");
730 send_status = safe_apic_wait_icr_idle(); 666 send_status = safe_apic_wait_icr_idle();
731 667
732 mb(); 668 mb();
@@ -748,64 +684,51 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
748 * target processor state. 684 * target processor state.
749 */ 685 */
750 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 686 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
751#ifdef CONFIG_X86_64
752 (unsigned long)init_rsp);
753#else
754 (unsigned long)stack_start.sp); 687 (unsigned long)stack_start.sp);
755#endif
756 688
757 /* 689 /*
758 * Run STARTUP IPI loop. 690 * Run STARTUP IPI loop.
759 */ 691 */
760 Dprintk("#startup loops: %d.\n", num_starts); 692 pr_debug("#startup loops: %d.\n", num_starts);
761
762 maxlvt = lapic_get_maxlvt();
763 693
764 for (j = 1; j <= num_starts; j++) { 694 for (j = 1; j <= num_starts; j++) {
765 Dprintk("Sending STARTUP #%d.\n", j); 695 pr_debug("Sending STARTUP #%d.\n", j);
766 apic_read_around(APIC_SPIV); 696 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
767 apic_write(APIC_ESR, 0); 697 apic_write(APIC_ESR, 0);
768 apic_read(APIC_ESR); 698 apic_read(APIC_ESR);
769 Dprintk("After apic_write.\n"); 699 pr_debug("After apic_write.\n");
770 700
771 /* 701 /*
772 * STARTUP IPI 702 * STARTUP IPI
773 */ 703 */
774 704
775 /* Target chip */ 705 /* Target chip */
776 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
777
778 /* Boot on the stack */ 706 /* Boot on the stack */
779 /* Kick the second */ 707 /* Kick the second */
780 apic_write_around(APIC_ICR, APIC_DM_STARTUP 708 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
781 | (start_eip >> 12)); 709 phys_apicid);
782 710
783 /* 711 /*
784 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
785 */ 713 */
786 udelay(300); 714 udelay(300);
787 715
788 Dprintk("Startup point 1.\n"); 716 pr_debug("Startup point 1.\n");
789 717
790 Dprintk("Waiting for send to finish...\n"); 718 pr_debug("Waiting for send to finish...\n");
791 send_status = safe_apic_wait_icr_idle(); 719 send_status = safe_apic_wait_icr_idle();
792 720
793 /* 721 /*
794 * Give the other CPU some time to accept the IPI. 722 * Give the other CPU some time to accept the IPI.
795 */ 723 */
796 udelay(200); 724 udelay(200);
797 /* 725 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
798 * Due to the Pentium erratum 3AP.
799 */
800 if (maxlvt > 3) {
801 apic_read_around(APIC_SPIV);
802 apic_write(APIC_ESR, 0); 726 apic_write(APIC_ESR, 0);
803 }
804 accept_status = (apic_read(APIC_ESR) & 0xEF); 727 accept_status = (apic_read(APIC_ESR) & 0xEF);
805 if (send_status || accept_status) 728 if (send_status || accept_status)
806 break; 729 break;
807 } 730 }
808 Dprintk("After Startup.\n"); 731 pr_debug("After Startup.\n");
809 732
810 if (send_status) 733 if (send_status)
811 printk(KERN_ERR "APIC never delivered???\n"); 734 printk(KERN_ERR "APIC never delivered???\n");
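The INIT/STARTUP sequences above also move from the two-step apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(...)) / apic_write_around(APIC_ICR, ...) pattern to a single apic_icr_write(value, destination) call. A compressed sketch of the new convention, reusing the INIT assert from the hunks above (the helper name below is illustrative):

#include <asm/apic.h>
#include <asm/apicdef.h>

/* hypothetical helper: assert INIT on one target CPU, then wait for the ICR */
static void send_init_ipi(int phys_apicid)
{
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
		       phys_apicid);
	safe_apic_wait_icr_idle();
}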
@@ -832,6 +755,52 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
832 complete(&c_idle->done); 755 complete(&c_idle->done);
833} 756}
834 757
758#ifdef CONFIG_X86_64
759
760/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
761static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
762{
763 if (!after_bootmem)
764 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
765}
766
767/*
768 * Allocate node local memory for the AP pda.
769 *
770 * Must be called after the _cpu_pda pointer table is initialized.
771 */
772int __cpuinit get_local_pda(int cpu)
773{
774 struct x8664_pda *oldpda, *newpda;
775 unsigned long size = sizeof(struct x8664_pda);
776 int node = cpu_to_node(cpu);
777
778 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
779 return 0;
780
781 oldpda = cpu_pda(cpu);
782 newpda = kmalloc_node(size, GFP_ATOMIC, node);
783 if (!newpda) {
784 printk(KERN_ERR "Could not allocate node local PDA "
785 "for CPU %d on node %d\n", cpu, node);
786
787 if (oldpda)
788 return 0; /* have a usable pda */
789 else
790 return -1;
791 }
792
793 if (oldpda) {
794 memcpy(newpda, oldpda, size);
795 free_bootmem_pda(oldpda);
796 }
797
798 newpda->in_bootmem = 0;
799 cpu_pda(cpu) = newpda;
800 return 0;
801}
802#endif /* CONFIG_X86_64 */
803
835static int __cpuinit do_boot_cpu(int apicid, int cpu) 804static int __cpuinit do_boot_cpu(int apicid, int cpu)
836/* 805/*
837 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 806 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -848,28 +817,14 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
848 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 817 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
849 }; 818 };
850 INIT_WORK(&c_idle.work, do_fork_idle); 819 INIT_WORK(&c_idle.work, do_fork_idle);
851#ifdef CONFIG_X86_64
852 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
853 if (!cpu_gdt_descr[cpu].address &&
854 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
855 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
856 return -1;
857 }
858 820
821#ifdef CONFIG_X86_64
859 /* Allocate node local memory for AP pdas */ 822 /* Allocate node local memory for AP pdas */
860 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { 823 if (cpu > 0) {
861 struct x8664_pda *newpda, *pda; 824 boot_error = get_local_pda(cpu);
862 int node = cpu_to_node(cpu); 825 if (boot_error)
863 pda = cpu_pda(cpu); 826 goto restore_state;
864 newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, 827 /* if can't get pda memory, can't start cpu */
865 node);
866 if (newpda) {
867 memcpy(newpda, pda, sizeof(struct x8664_pda));
868 cpu_pda(cpu) = newpda;
869 } else
870 printk(KERN_ERR
871 "Could not allocate node local PDA for CPU %d on node %d\n",
872 cpu, node);
873 } 828 }
874#endif 829#endif
875 830
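With the node-local PDA allocation factored out into get_local_pda(), do_boot_cpu() no longer open-codes the kmalloc_node()/memcpy() dance and simply aborts the bring-up when no usable PDA can be obtained. An illustrative caller pattern (the wrapper name and the cpu == 0 shortcut are not part of this commit):

#ifdef CONFIG_X86_64
/* hypothetical wrapper around the helper added above */
static int prepare_ap_pda(int cpu)
{
	if (cpu == 0)
		return 0;	/* the boot CPU keeps its static PDA */

	/* returns 0 if a usable PDA exists or was allocated, -1 otherwise */
	return get_local_pda(cpu);
}
#endif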
@@ -905,18 +860,15 @@ do_rest:
905#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
906 per_cpu(current_task, cpu) = c_idle.idle; 861 per_cpu(current_task, cpu) = c_idle.idle;
907 init_gdt(cpu); 862 init_gdt(cpu);
908 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
909 c_idle.idle->thread.ip = (unsigned long) start_secondary;
910 /* Stack for startup_32 can be just as for start_secondary onwards */ 863 /* Stack for startup_32 can be just as for start_secondary onwards */
911 stack_start.sp = (void *) c_idle.idle->thread.sp;
912 irq_ctx_init(cpu); 864 irq_ctx_init(cpu);
913#else 865#else
914 cpu_pda(cpu)->pcurrent = c_idle.idle; 866 cpu_pda(cpu)->pcurrent = c_idle.idle;
915 init_rsp = c_idle.idle->thread.sp;
916 load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
917 initial_code = (unsigned long)start_secondary;
918 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 867 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
919#endif 868#endif
869 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
870 initial_code = (unsigned long)start_secondary;
871 stack_start.sp = (void *) c_idle.idle->thread.sp;
920 872
921 /* start_ip had better be page-aligned! */ 873 /* start_ip had better be page-aligned! */
922 start_ip = setup_trampoline(); 874 start_ip = setup_trampoline();
@@ -934,7 +886,7 @@ do_rest:
934 886
935 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
936 888
937 Dprintk("Setting warm reset code and vector.\n"); 889 pr_debug("Setting warm reset code and vector.\n");
938 890
939 store_NMI_vector(&nmi_high, &nmi_low); 891 store_NMI_vector(&nmi_high, &nmi_low);
940 892
@@ -955,9 +907,9 @@ do_rest:
955 /* 907 /*
956 * allow APs to start initializing. 908 * allow APs to start initializing.
957 */ 909 */
958 Dprintk("Before Callout %d.\n", cpu); 910 pr_debug("Before Callout %d.\n", cpu);
959 cpu_set(cpu, cpu_callout_map); 911 cpu_set(cpu, cpu_callout_map);
960 Dprintk("After Callout %d.\n", cpu); 912 pr_debug("After Callout %d.\n", cpu);
961 913
962 /* 914 /*
963 * Wait 5s total for a response 915 * Wait 5s total for a response
@@ -970,10 +922,10 @@ do_rest:
970 922
971 if (cpu_isset(cpu, cpu_callin_map)) { 923 if (cpu_isset(cpu, cpu_callin_map)) {
972 /* number CPUs logically, starting from 1 (BSP is 0) */ 924 /* number CPUs logically, starting from 1 (BSP is 0) */
973 Dprintk("OK.\n"); 925 pr_debug("OK.\n");
974 printk(KERN_INFO "CPU%d: ", cpu); 926 printk(KERN_INFO "CPU%d: ", cpu);
975 print_cpu_info(&cpu_data(cpu)); 927 print_cpu_info(&cpu_data(cpu));
976 Dprintk("CPU has booted.\n"); 928 pr_debug("CPU has booted.\n");
977 } else { 929 } else {
978 boot_error = 1; 930 boot_error = 1;
979 if (*((volatile unsigned char *)trampoline_base) 931 if (*((volatile unsigned char *)trampoline_base)
@@ -987,16 +939,14 @@ do_rest:
987 inquire_remote_apic(apicid); 939 inquire_remote_apic(apicid);
988 } 940 }
989 } 941 }
990
991 if (boot_error) {
992 /* Try to put things back the way they were before ... */
993 unmap_cpu_to_logical_apicid(cpu);
994#ifdef CONFIG_X86_64 942#ifdef CONFIG_X86_64
995 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 943restore_state:
996#endif 944#endif
945 if (boot_error) {
946 /* Try to put things back the way they were before ... */
947 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
997 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ 948 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
998 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 949 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
999 cpu_clear(cpu, cpu_possible_map);
1000 cpu_clear(cpu, cpu_present_map); 950 cpu_clear(cpu, cpu_present_map);
1001 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; 951 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
1002 } 952 }
@@ -1020,7 +970,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1020 970
1021 WARN_ON(irqs_disabled()); 971 WARN_ON(irqs_disabled());
1022 972
1023 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 973 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1024 974
1025 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
1026 !physid_isset(apicid, phys_cpu_present_map)) { 976 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -1032,7 +982,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1032 * Already booted CPU? 982 * Already booted CPU?
1033 */ 983 */
1034 if (cpu_isset(cpu, cpu_callin_map)) { 984 if (cpu_isset(cpu, cpu_callin_map)) {
1035 Dprintk("do_boot_cpu %d Already started\n", cpu); 985 pr_debug("do_boot_cpu %d Already started\n", cpu);
1036 return -ENOSYS; 986 return -ENOSYS;
1037 } 987 }
1038 988
@@ -1059,7 +1009,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1059 err = do_boot_cpu(apicid, cpu); 1009 err = do_boot_cpu(apicid, cpu);
1060#endif 1010#endif
1061 if (err) { 1011 if (err) {
1062 Dprintk("do_boot_cpu failed %d\n", err); 1012 pr_debug("do_boot_cpu failed %d\n", err);
1063 return -EIO; 1013 return -EIO;
1064 } 1014 }
1065 1015
@@ -1088,14 +1038,12 @@ static __init void disable_smp(void)
1088{ 1038{
1089 cpu_present_map = cpumask_of_cpu(0); 1039 cpu_present_map = cpumask_of_cpu(0);
1090 cpu_possible_map = cpumask_of_cpu(0); 1040 cpu_possible_map = cpumask_of_cpu(0);
1091#ifdef CONFIG_X86_32
1092 smpboot_clear_io_apic_irqs(); 1041 smpboot_clear_io_apic_irqs();
1093#endif 1042
1094 if (smp_found_config) 1043 if (smp_found_config)
1095 phys_cpu_present_map = 1044 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1096 physid_mask_of_physid(boot_cpu_physical_apicid);
1097 else 1045 else
1098 phys_cpu_present_map = physid_mask_of_physid(0); 1046 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1099 map_cpu_to_logical_apicid(); 1047 map_cpu_to_logical_apicid();
1100 cpu_set(0, per_cpu(cpu_sibling_map, 0)); 1048 cpu_set(0, per_cpu(cpu_sibling_map, 0));
1101 cpu_set(0, per_cpu(cpu_core_map, 0)); 1049 cpu_set(0, per_cpu(cpu_core_map, 0));
@@ -1107,6 +1055,34 @@ static __init void disable_smp(void)
1107static int __init smp_sanity_check(unsigned max_cpus) 1055static int __init smp_sanity_check(unsigned max_cpus)
1108{ 1056{
1109 preempt_disable(); 1057 preempt_disable();
1058
1059#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1060 if (def_to_bigsmp && nr_cpu_ids > 8) {
1061 unsigned int cpu;
1062 unsigned nr;
1063
1064 printk(KERN_WARNING
1065 "More than 8 CPUs detected - skipping them.\n"
1066 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1067
1068 nr = 0;
1069 for_each_present_cpu(cpu) {
1070 if (nr >= 8)
1071 cpu_clear(cpu, cpu_present_map);
1072 nr++;
1073 }
1074
1075 nr = 0;
1076 for_each_possible_cpu(cpu) {
1077 if (nr >= 8)
1078 cpu_clear(cpu, cpu_possible_map);
1079 nr++;
1080 }
1081
1082 nr_cpu_ids = 8;
1083 }
1084#endif
1085
1110 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1086 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1111 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1087 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1112 "by the BIOS.\n", hard_smp_processor_id()); 1088 "by the BIOS.\n", hard_smp_processor_id());
@@ -1158,12 +1134,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
1158 * If SMP should be disabled, then really disable it! 1134 * If SMP should be disabled, then really disable it!
1159 */ 1135 */
1160 if (!max_cpus) { 1136 if (!max_cpus) {
1161 printk(KERN_INFO "SMP mode deactivated," 1137 printk(KERN_INFO "SMP mode deactivated.\n");
1162 "forcing use of dummy APIC emulation.\n");
1163 smpboot_clear_io_apic(); 1138 smpboot_clear_io_apic();
1164#ifdef CONFIG_X86_32 1139
1140 localise_nmi_watchdog();
1141
1165 connect_bsp_APIC(); 1142 connect_bsp_APIC();
1166#endif
1167 setup_local_APIC(); 1143 setup_local_APIC();
1168 end_local_APIC_setup(); 1144 end_local_APIC_setup();
1169 return -1; 1145 return -1;
@@ -1191,7 +1167,6 @@ static void __init smp_cpu_index_default(void)
1191void __init native_smp_prepare_cpus(unsigned int max_cpus) 1167void __init native_smp_prepare_cpus(unsigned int max_cpus)
1192{ 1168{
1193 preempt_disable(); 1169 preempt_disable();
1194 nmi_watchdog_default();
1195 smp_cpu_index_default(); 1170 smp_cpu_index_default();
1196 current_cpu_data = boot_cpu_data; 1171 current_cpu_data = boot_cpu_data;
1197 cpu_callin_map = cpumask_of_cpu(0); 1172 cpu_callin_map = cpumask_of_cpu(0);
@@ -1200,10 +1175,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1200 * Setup boot CPU information 1175 * Setup boot CPU information
1201 */ 1176 */
1202 smp_store_cpu_info(0); /* Final full version of the data */ 1177 smp_store_cpu_info(0); /* Final full version of the data */
1178#ifdef CONFIG_X86_32
1203 boot_cpu_logical_apicid = logical_smp_processor_id(); 1179 boot_cpu_logical_apicid = logical_smp_processor_id();
1180#endif
1204 current_thread_info()->cpu = 0; /* needed? */ 1181 current_thread_info()->cpu = 0; /* needed? */
1205 set_cpu_sibling_map(0); 1182 set_cpu_sibling_map(0);
1206 1183
1184#ifdef CONFIG_X86_64
1185 enable_IR_x2apic();
1186 setup_apic_routing();
1187#endif
1188
1207 if (smp_sanity_check(max_cpus) < 0) { 1189 if (smp_sanity_check(max_cpus) < 0) {
1208 printk(KERN_INFO "SMP disabled\n"); 1190 printk(KERN_INFO "SMP disabled\n");
1209 disable_smp(); 1191 disable_smp();
@@ -1211,16 +1193,15 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1211 } 1193 }
1212 1194
1213 preempt_disable(); 1195 preempt_disable();
1214 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1196 if (read_apic_id() != boot_cpu_physical_apicid) {
1215 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1197 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1216 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1198 read_apic_id(), boot_cpu_physical_apicid);
1217 /* Or can we switch back to PIC here? */ 1199 /* Or can we switch back to PIC here? */
1218 } 1200 }
1219 preempt_enable(); 1201 preempt_enable();
1220 1202
1221#ifdef CONFIG_X86_32
1222 connect_bsp_APIC(); 1203 connect_bsp_APIC();
1223#endif 1204
1224 /* 1205 /*
1225 * Switch from PIC to APIC mode. 1206 * Switch from PIC to APIC mode.
1226 */ 1207 */
@@ -1247,6 +1228,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1247 printk(KERN_INFO "CPU%d: ", 0); 1228 printk(KERN_INFO "CPU%d: ", 0);
1248 print_cpu_info(&cpu_data(0)); 1229 print_cpu_info(&cpu_data(0));
1249 setup_boot_clock(); 1230 setup_boot_clock();
1231
1232 if (is_uv_system())
1233 uv_system_init();
1250out: 1234out:
1251 preempt_enable(); 1235 preempt_enable();
1252} 1236}
@@ -1258,8 +1242,8 @@ void __init native_smp_prepare_boot_cpu(void)
1258 int me = smp_processor_id(); 1242 int me = smp_processor_id();
1259#ifdef CONFIG_X86_32 1243#ifdef CONFIG_X86_32
1260 init_gdt(me); 1244 init_gdt(me);
1261 switch_to_new_gdt();
1262#endif 1245#endif
1246 switch_to_new_gdt();
1263 /* already set me in cpu_online_map in boot_cpu_init() */ 1247 /* already set me in cpu_online_map in boot_cpu_init() */
1264 cpu_set(me, cpu_callout_map); 1248 cpu_set(me, cpu_callout_map);
1265 per_cpu(cpu_state, me) = CPU_ONLINE; 1249 per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1267,7 +1251,7 @@ void __init native_smp_prepare_boot_cpu(void)
1267 1251
1268void __init native_smp_cpus_done(unsigned int max_cpus) 1252void __init native_smp_cpus_done(unsigned int max_cpus)
1269{ 1253{
1270 Dprintk("Boot done.\n"); 1254 pr_debug("Boot done.\n");
1271 1255
1272 impress_friends(); 1256 impress_friends();
1273 smp_checks(); 1257 smp_checks();
@@ -1277,56 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1277 check_nmi_watchdog(); 1261 check_nmi_watchdog();
1278} 1262}
1279 1263
1280#ifdef CONFIG_HOTPLUG_CPU
1281
1282# ifdef CONFIG_X86_32
1283void cpu_exit_clear(void)
1284{
1285 int cpu = raw_smp_processor_id();
1286
1287 idle_task_exit();
1288
1289 cpu_uninit();
1290 irq_ctx_exit(cpu);
1291
1292 cpu_clear(cpu, cpu_callout_map);
1293 cpu_clear(cpu, cpu_callin_map);
1294
1295 unmap_cpu_to_logical_apicid(cpu);
1296}
1297# endif /* CONFIG_X86_32 */
1298
1299static void remove_siblinginfo(int cpu)
1300{
1301 int sibling;
1302 struct cpuinfo_x86 *c = &cpu_data(cpu);
1303
1304 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
1305 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1306 /*/
1307 * last thread sibling in this cpu core going down
1308 */
1309 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1310 cpu_data(sibling).booted_cores--;
1311 }
1312
1313 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
1314 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1315 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1316 cpus_clear(per_cpu(cpu_core_map, cpu));
1317 c->phys_proc_id = 0;
1318 c->cpu_core_id = 0;
1319 cpu_clear(cpu, cpu_sibling_setup_map);
1320}
1321
1322static int additional_cpus __initdata = -1;
1323
1324static __init int setup_additional_cpus(char *s)
1325{
1326 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1327}
1328early_param("additional_cpus", setup_additional_cpus);
1329
1330/* 1264/*
1331 * cpu_possible_map should be static, it cannot change as cpu's 1265 * cpu_possible_map should be static, it cannot change as cpu's
1332 * are onlined, or offlined. The reason is per-cpu data-structures 1266 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1346,16 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus);
1346 */ 1280 */
1347__init void prefill_possible_map(void) 1281__init void prefill_possible_map(void)
1348{ 1282{
1349 int i; 1283 int i, possible;
1350 int possible;
1351 1284
1352 if (additional_cpus == -1) { 1285 /* no processor from mptable or madt */
1353 if (disabled_cpus > 0) 1286 if (!num_processors)
1354 additional_cpus = disabled_cpus; 1287 num_processors = 1;
1355 else 1288
1356 additional_cpus = 0; 1289 possible = num_processors + disabled_cpus;
1357 }
1358 possible = num_processors + additional_cpus;
1359 if (possible > NR_CPUS) 1290 if (possible > NR_CPUS)
1360 possible = NR_CPUS; 1291 possible = NR_CPUS;
1361 1292
@@ -1364,21 +1295,68 @@ __init void prefill_possible_map(void)
1364 1295
1365 for (i = 0; i < possible; i++) 1296 for (i = 0; i < possible; i++)
1366 cpu_set(i, cpu_possible_map); 1297 cpu_set(i, cpu_possible_map);
1298
1299 nr_cpu_ids = possible;
1300}
1301
1302#ifdef CONFIG_HOTPLUG_CPU
1303
1304static void remove_siblinginfo(int cpu)
1305{
1306 int sibling;
1307 struct cpuinfo_x86 *c = &cpu_data(cpu);
1308
1309 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1310 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1311 /*/
1312 * last thread sibling in this cpu core going down
1313 */
1314 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1315 cpu_data(sibling).booted_cores--;
1316 }
1317
1318 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1319 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1320 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1321 cpus_clear(per_cpu(cpu_core_map, cpu));
1322 c->phys_proc_id = 0;
1323 c->cpu_core_id = 0;
1324 cpu_clear(cpu, cpu_sibling_setup_map);
1367} 1325}
1368 1326
1369static void __ref remove_cpu_from_maps(int cpu) 1327static void __ref remove_cpu_from_maps(int cpu)
1370{ 1328{
1371 cpu_clear(cpu, cpu_online_map); 1329 cpu_clear(cpu, cpu_online_map);
1372#ifdef CONFIG_X86_64
1373 cpu_clear(cpu, cpu_callout_map); 1330 cpu_clear(cpu, cpu_callout_map);
1374 cpu_clear(cpu, cpu_callin_map); 1331 cpu_clear(cpu, cpu_callin_map);
1375 /* was set by cpu_init() */ 1332 /* was set by cpu_init() */
1376 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1333 cpu_clear(cpu, cpu_initialized);
1377 clear_node_cpumask(cpu); 1334 numa_remove_cpu(cpu);
1378#endif
1379} 1335}
1380 1336
1381int __cpu_disable(void) 1337void cpu_disable_common(void)
1338{
1339 int cpu = smp_processor_id();
1340 /*
1341 * HACK:
1342 * Allow any queued timer interrupts to get serviced
1343 * This is only a temporary solution until we cleanup
1344 * fixup_irqs as we do for IA64.
1345 */
1346 local_irq_enable();
1347 mdelay(1);
1348
1349 local_irq_disable();
1350 remove_siblinginfo(cpu);
1351
1352 /* It's now safe to remove this processor from the online map */
1353 lock_vector_lock();
1354 remove_cpu_from_maps(cpu);
1355 unlock_vector_lock();
1356 fixup_irqs(cpu_online_map);
1357}
1358
1359int native_cpu_disable(void)
1382{ 1360{
1383 int cpu = smp_processor_id(); 1361 int cpu = smp_processor_id();
1384 1362
@@ -1397,25 +1375,11 @@ int __cpu_disable(void)
1397 stop_apic_nmi_watchdog(NULL); 1375 stop_apic_nmi_watchdog(NULL);
1398 clear_local_APIC(); 1376 clear_local_APIC();
1399 1377
1400 /* 1378 cpu_disable_common();
1401 * HACK:
1402 * Allow any queued timer interrupts to get serviced
1403 * This is only a temporary solution until we cleanup
1404 * fixup_irqs as we do for IA64.
1405 */
1406 local_irq_enable();
1407 mdelay(1);
1408
1409 local_irq_disable();
1410 remove_siblinginfo(cpu);
1411
1412 /* It's now safe to remove this processor from the online map */
1413 remove_cpu_from_maps(cpu);
1414 fixup_irqs(cpu_online_map);
1415 return 0; 1379 return 0;
1416} 1380}
1417 1381
1418void __cpu_die(unsigned int cpu) 1382void native_cpu_die(unsigned int cpu)
1419{ 1383{
1420 /* We don't do anything here: idle task is faking death itself. */ 1384 /* We don't do anything here: idle task is faking death itself. */
1421 unsigned int i; 1385 unsigned int i;
@@ -1432,28 +1396,45 @@ void __cpu_die(unsigned int cpu)
1432 } 1396 }
1433 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1397 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1434} 1398}
1399
1400void play_dead_common(void)
1401{
1402 idle_task_exit();
1403 reset_lazy_tlbstate();
1404 irq_ctx_exit(raw_smp_processor_id());
1405 c1e_remove_cpu(raw_smp_processor_id());
1406
1407 mb();
1408 /* Ack it */
1409 __get_cpu_var(cpu_state) = CPU_DEAD;
1410
1411 /*
1412 * With physical CPU hotplug, we should halt the cpu
1413 */
1414 local_irq_disable();
1415}
1416
1417void native_play_dead(void)
1418{
1419 play_dead_common();
1420 wbinvd_halt();
1421}
1422
1435#else /* ... !CONFIG_HOTPLUG_CPU */ 1423#else /* ... !CONFIG_HOTPLUG_CPU */
1436int __cpu_disable(void) 1424int native_cpu_disable(void)
1437{ 1425{
1438 return -ENOSYS; 1426 return -ENOSYS;
1439} 1427}
1440 1428
1441void __cpu_die(unsigned int cpu) 1429void native_cpu_die(unsigned int cpu)
1442{ 1430{
1443 /* We said "no" in __cpu_disable */ 1431 /* We said "no" in __cpu_disable */
1444 BUG(); 1432 BUG();
1445} 1433}
1446#endif
1447 1434
1448/* 1435void native_play_dead(void)
1449 * If the BIOS enumerates physical processors before logical,
1450 * maxcpus=N at enumeration-time can be used to disable HT.
1451 */
1452static int __init parse_maxcpus(char *arg)
1453{ 1436{
1454 extern unsigned int maxcpus; 1437 BUG();
1455
1456 maxcpus = simple_strtoul(arg, NULL, 0);
1457 return 0;
1458} 1438}
1459early_param("maxcpus", parse_maxcpus); 1439
1440#endif
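The CPU-hotplug entry points are no longer the __cpu_disable()/__cpu_die() functions themselves: the native implementations above are wired into struct smp_ops (see the .cpu_disable/.cpu_die/.play_dead members in the smp.c hunk), so a paravirtualized guest can substitute its own. The generic names presumably become thin wrappers in the smp_ops header along these lines (a sketch, not verbatim from this commit):

static inline int __cpu_disable(void)
{
	return smp_ops.cpu_disable();
}

static inline void __cpu_die(unsigned int cpu)
{
	smp_ops.cpu_die(cpu);
}

static inline void play_dead(void)
{
	smp_ops.play_dead();
}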
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 3449064d141a..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,76 +8,23 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
26} 29}
27#endif 30#endif
28
29/**
30 * smp_call_function(): Run a function on all other CPUs.
31 * @func: The function to run. This must be fast and non-blocking.
32 * @info: An arbitrary pointer to pass to the function.
33 * @nonatomic: Unused.
34 * @wait: If true, wait (atomically) until function has completed on other CPUs.
35 *
36 * Returns 0 on success, else a negative status code.
37 *
38 * If @wait is true, then returns once @func has returned; otherwise
39 * it returns just before the target cpu calls @func.
40 *
41 * You must not call this function with disabled interrupts or from a
42 * hardware interrupt handler or from a bottom half handler.
43 */
44int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
45 int wait)
46{
47 return smp_call_function_mask(cpu_online_map, func, info, wait);
48}
49EXPORT_SYMBOL(smp_call_function);
50
51/**
52 * smp_call_function_single - Run a function on a specific CPU
53 * @cpu: The target CPU. Cannot be the calling CPU.
54 * @func: The function to run. This must be fast and non-blocking.
55 * @info: An arbitrary pointer to pass to the function.
56 * @nonatomic: Unused.
57 * @wait: If true, wait until function has completed on other CPUs.
58 *
59 * Returns 0 on success, else a negative status code.
60 *
61 * If @wait is true, then returns once @func has returned; otherwise
62 * it returns just before the target cpu calls @func.
63 */
64int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
65 int nonatomic, int wait)
66{
67 /* prevent preemption and reschedule on another processor */
68 int ret;
69 int me = get_cpu();
70 if (cpu == me) {
71 local_irq_disable();
72 func(info);
73 local_irq_enable();
74 put_cpu();
75 return 0;
76 }
77
78 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
79
80 put_cpu();
81 return ret;
82}
83EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c28c342c162f..a03e7f6d90c3 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace)
74 if (trace->nr_entries < trace->max_entries) 74 if (trace->nr_entries < trace->max_entries)
75 trace->entries[trace->nr_entries++] = ULONG_MAX; 75 trace->entries[trace->nr_entries++] = ULONG_MAX;
76} 76}
77EXPORT_SYMBOL_GPL(save_stack_trace);
77 78
78void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 79void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
79{ 80{
@@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81 if (trace->nr_entries < trace->max_entries) 82 if (trace->nr_entries < trace->max_entries)
82 trace->entries[trace->nr_entries++] = ULONG_MAX; 83 trace->entries[trace->nr_entries++] = ULONG_MAX;
83} 84}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
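The enable_single_step() changes above tighten the TF bookkeeping: the pre-existing EFLAGS.TF value is sampled into oflags before TF is forced on, TIF_FORCED_TF is dropped when the traced instruction itself sets TF, and block stepping over a pre-set TF is only permitted when it was the kernel (TIF_FORCED_TF) that set it. A condensed restatement of that decision as a stand-alone helper (illustrative only, not the kernel function itself):

#include <linux/sched.h>
#include <asm/processor-flags.h>

static int may_block_step(struct task_struct *child, unsigned long oflags,
			  int insn_sets_tf)
{
	if (insn_sets_tf) {
		/* the traced instruction will own TF; do not claim it */
		clear_tsk_thread_flag(child, TIF_FORCED_TF);
		return 0;
	}

	if (oflags & X86_EFLAGS_TF)
		/* TF predates us: block-step only if we set it earlier */
		return test_tsk_thread_flag(child, TIF_FORCED_TF);

	set_tsk_thread_flag(child, TIF_FORCED_TF);
	return 1;
}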
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,13 +30,15 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; 37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
38 38
39#ifndef CONFIG_X86_NUMAQ
39static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; 40static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
41#endif
40 42
41static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) 43static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
42{ 44{
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6b..1884a8d12bfa 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,10 @@
19#include <linux/utsname.h> 19#include <linux/utsname.h>
20#include <linux/ipc.h> 20#include <linux/ipc.h>
21 21
22#include <asm/uaccess.h> 22#include <linux/uaccess.h>
23#include <asm/unistd.h> 23#include <linux/unistd.h>
24
25#include <asm/syscalls.h>
24 26
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 28 unsigned long prot, unsigned long flags,
@@ -103,7 +105,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
103 * 105 *
104 * This is really horribly ugly. 106 * This is really horribly ugly.
105 */ 107 */
106asmlinkage int sys_ipc (uint call, int first, int second, 108asmlinkage int sys_ipc(uint call, int first, int second,
107 int third, void __user *ptr, long fifth) 109 int third, void __user *ptr, long fifth)
108{ 110{
109 int version, ret; 111 int version, ret;
@@ -113,24 +115,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
113 115
114 switch (call) { 116 switch (call) {
115 case SEMOP: 117 case SEMOP:
116 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); 118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
117 case SEMTIMEDOP: 119 case SEMTIMEDOP:
118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, 120 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
119 (const struct timespec __user *)fifth); 121 (const struct timespec __user *)fifth);
120 122
121 case SEMGET: 123 case SEMGET:
122 return sys_semget (first, second, third); 124 return sys_semget(first, second, third);
123 case SEMCTL: { 125 case SEMCTL: {
124 union semun fourth; 126 union semun fourth;
125 if (!ptr) 127 if (!ptr)
126 return -EINVAL; 128 return -EINVAL;
127 if (get_user(fourth.__pad, (void __user * __user *) ptr)) 129 if (get_user(fourth.__pad, (void __user * __user *) ptr))
128 return -EFAULT; 130 return -EFAULT;
129 return sys_semctl (first, second, third, fourth); 131 return sys_semctl(first, second, third, fourth);
130 } 132 }
131 133
132 case MSGSND: 134 case MSGSND:
133 return sys_msgsnd (first, (struct msgbuf __user *) ptr, 135 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
134 second, third); 136 second, third);
135 case MSGRCV: 137 case MSGRCV:
136 switch (version) { 138 switch (version) {
@@ -138,45 +140,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
138 struct ipc_kludge tmp; 140 struct ipc_kludge tmp;
139 if (!ptr) 141 if (!ptr)
140 return -EINVAL; 142 return -EINVAL;
141 143
142 if (copy_from_user(&tmp, 144 if (copy_from_user(&tmp,
143 (struct ipc_kludge __user *) ptr, 145 (struct ipc_kludge __user *) ptr,
144 sizeof (tmp))) 146 sizeof(tmp)))
145 return -EFAULT; 147 return -EFAULT;
146 return sys_msgrcv (first, tmp.msgp, second, 148 return sys_msgrcv(first, tmp.msgp, second,
147 tmp.msgtyp, third); 149 tmp.msgtyp, third);
148 } 150 }
149 default: 151 default:
150 return sys_msgrcv (first, 152 return sys_msgrcv(first,
151 (struct msgbuf __user *) ptr, 153 (struct msgbuf __user *) ptr,
152 second, fifth, third); 154 second, fifth, third);
153 } 155 }
154 case MSGGET: 156 case MSGGET:
155 return sys_msgget ((key_t) first, second); 157 return sys_msgget((key_t) first, second);
156 case MSGCTL: 158 case MSGCTL:
157 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); 159 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
158 160
159 case SHMAT: 161 case SHMAT:
160 switch (version) { 162 switch (version) {
161 default: { 163 default: {
162 ulong raddr; 164 ulong raddr;
163 ret = do_shmat (first, (char __user *) ptr, second, &raddr); 165 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
164 if (ret) 166 if (ret)
165 return ret; 167 return ret;
166 return put_user (raddr, (ulong __user *) third); 168 return put_user(raddr, (ulong __user *) third);
167 } 169 }
168 case 1: /* iBCS2 emulator entry point */ 170 case 1: /* iBCS2 emulator entry point */
169 if (!segment_eq(get_fs(), get_ds())) 171 if (!segment_eq(get_fs(), get_ds()))
170 return -EINVAL; 172 return -EINVAL;
171 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ 173 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
172 return do_shmat (first, (char __user *) ptr, second, (ulong *) third); 174 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
173 } 175 }
174 case SHMDT: 176 case SHMDT:
175 return sys_shmdt ((char __user *)ptr); 177 return sys_shmdt((char __user *)ptr);
176 case SHMGET: 178 case SHMGET:
177 return sys_shmget (first, second, third); 179 return sys_shmget(first, second, third);
178 case SHMCTL: 180 case SHMCTL:
179 return sys_shmctl (first, second, 181 return sys_shmctl(first, second,
180 (struct shmid_ds __user *) ptr); 182 (struct shmid_ds __user *) ptr);
181 default: 183 default:
182 return -ENOSYS; 184 return -ENOSYS;
@@ -186,28 +188,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
186/* 188/*
187 * Old cruft 189 * Old cruft
188 */ 190 */
189asmlinkage int sys_uname(struct old_utsname __user * name) 191asmlinkage int sys_uname(struct old_utsname __user *name)
190{ 192{
191 int err; 193 int err;
192 if (!name) 194 if (!name)
193 return -EFAULT; 195 return -EFAULT;
194 down_read(&uts_sem); 196 down_read(&uts_sem);
195 err = copy_to_user(name, utsname(), sizeof (*name)); 197 err = copy_to_user(name, utsname(), sizeof(*name));
196 up_read(&uts_sem); 198 up_read(&uts_sem);
197 return err?-EFAULT:0; 199 return err? -EFAULT:0;
198} 200}
199 201
200asmlinkage int sys_olduname(struct oldold_utsname __user * name) 202asmlinkage int sys_olduname(struct oldold_utsname __user *name)
201{ 203{
202 int error; 204 int error;
203 205
204 if (!name) 206 if (!name)
205 return -EFAULT; 207 return -EFAULT;
206 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) 208 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
207 return -EFAULT; 209 return -EFAULT;
208 210
209 down_read(&uts_sem); 211 down_read(&uts_sem);
210 212
211 error = __copy_to_user(&name->sysname, &utsname()->sysname, 213 error = __copy_to_user(&name->sysname, &utsname()->sysname,
212 __OLD_UTS_LEN); 214 __OLD_UTS_LEN);
213 error |= __put_user(0, name->sysname + __OLD_UTS_LEN); 215 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +225,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
223 error |= __copy_to_user(&name->machine, &utsname()->machine, 225 error |= __copy_to_user(&name->machine, &utsname()->machine,
224 __OLD_UTS_LEN); 226 __OLD_UTS_LEN);
225 error |= __put_user(0, name->machine + __OLD_UTS_LEN); 227 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
226 228
227 up_read(&uts_sem); 229 up_read(&uts_sem);
228 230
229 error = error ? -EFAULT : 0; 231 error = error ? -EFAULT : 0;
230 232
231 return error; 233 return error;
@@ -241,6 +243,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
241 long __res; 243 long __res;
242 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 244 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
243 : "=a" (__res) 245 : "=a" (__res)
244 : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); 246 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
245 return __res; 247 return __res;
246} 248}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h>
19 20
20asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
21 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
22{ 24{
23 long error; 25 long error;
24 struct file * file; 26 struct file *file;
25 27
26 error = -EINVAL; 28 error = -EINVAL;
27 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
56 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
57 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
58 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
59 of playground for now. -AK */ 61 of playground for now. -AK */
60 *begin = 0x40000000; 62 *begin = 0x40000000;
61 *end = 0x80000000; 63 *end = 0x80000000;
62 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
63 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
64 if (new_begin) 66 if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
66 } 68 }
67 } else { 69 } else {
68 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
69 *end = TASK_SIZE; 71 *end = TASK_SIZE;
70 } 72 }
71} 73}
72 74
73unsigned long 75unsigned long
74arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
78 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
79 unsigned long start_addr; 81 unsigned long start_addr;
80 unsigned long begin, end; 82 unsigned long begin, end;
81 83
82 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
83 return addr; 85 return addr;
84 86
85 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
86 88
87 if (len > end) 89 if (len > end)
88 return -ENOMEM; 90 return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
96 } 98 }
97 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
98 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
99 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
100 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
101 } 103 }
102 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
103 if (addr < begin) 105 if (addr < begin)
104 addr = begin; 106 addr = begin;
105 start_addr = addr; 107 start_addr = addr;
106 108
107full_search: 109full_search:
@@ -127,7 +129,7 @@ full_search:
127 return addr; 129 return addr;
128 } 130 }
129 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
130 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
131 133
132 addr = vma->vm_end; 134 addr = vma->vm_end;
133 } 135 }
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
177 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
178 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
179 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
180 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
181 } 183 }
182 184
183 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
194 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
195 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
196 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
198 200
199 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
200 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
224} 226}
225 227
226 228
227asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
228{ 230{
229 int err; 231 int err;
230 down_read(&uts_sem); 232 down_read(&uts_sem);
231 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
232 up_read(&uts_sem); 234 up_read(&uts_sem);
233 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
234 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
235 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
236} 238}
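The reworked sys_uname() above reports "i686" as the machine string whenever the calling task carries the PER_LINUX32 personality. A small userspace sketch, not part of this commit, that shows the effect; it assumes only the standard uname(2) and personality(2) wrappers from libc:

#include <stdio.h>
#include <sys/personality.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname u;

	/* native personality: machine comes back as e.g. "x86_64" */
	if (uname(&u) == 0)
		printf("native machine:      %s\n", u.machine);

	/* switch to the 32-bit Linux personality and ask again */
	if (personality(PER_LINUX32) != -1 && uname(&u) == 0)
		printf("PER_LINUX32 machine: %s\n", u.machine);

	return 0;
}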
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c17487..3d1be4f0fac5 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
8#define __NO_STUBS 8#define __NO_STUBS
9 9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_ 11#undef ASM_X86__UNISTD_64_H
12#include <asm/unistd_64.h> 12#include <asm/unistd_64.h>
13 13
14#undef __SYSCALL 14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym, 15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_64_UNISTD_H_ 16#undef ASM_X86__UNISTD_64_H
17 17
18typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
19 19
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
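The six entries appended above wire the new "flags" variants of existing syscalls (signalfd4, eventfd2, epoll_create1, dup3, pipe2, inotify_init1) into the 32-bit syscall table. A minimal userspace sketch of what the new pipe2 slot enables, assuming a libc that already exposes the pipe2() wrapper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	/* pipe2() sets O_CLOEXEC on both ends atomically, closing the
	 * pipe() + fcntl(FD_CLOEXEC) race window around fork/exec */
	if (pipe2(fds, O_CLOEXEC) == -1) {
		perror("pipe2");
		return 1;
	}
	printf("read fd %d, write fd %d (close-on-exec set)\n", fds[0], fds[1]);
	close(fds[0]);
	close(fds[1]);
	return 0;
}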
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f398934..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,12 +36,10 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h>
39 40
40#include "do_timer.h" 41#include "do_timer.h"
41 42
42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
43EXPORT_SYMBOL(cpu_khz);
44
45int timer_ack; 43int timer_ack;
46 44
47unsigned long profile_pc(struct pt_regs *regs) 45unsigned long profile_pc(struct pt_regs *regs)
@@ -49,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
49 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
50 48
51#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
52 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
53 in_lock_functions(pc)) {
54#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
55 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
56#else 53#else
57 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
58 55
@@ -84,8 +81,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
84 if (timer_ack) { 81 if (timer_ack) {
85 /* 82 /*
86 * Subtle, when I/O APICs are used we have to ack timer IRQ 83 * Subtle, when I/O APICs are used we have to ack timer IRQ
87 * manually to reset the IRR bit for do_slow_gettimeoffset(). 84 * manually to deassert NMI lines for the watchdog if run
88 * This will also deassert NMI lines for the watchdog if run
89 * on an 82489DX-based system. 85 * on an 82489DX-based system.
90 */ 86 */
91 spin_lock(&i8259A_lock); 87 spin_lock(&i8259A_lock);
@@ -98,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
98 94
99 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
100 96
97#ifdef CONFIG_MCA
101 if (MCA_bus) { 98 if (MCA_bus) {
102 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
103 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -111,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
111 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
112 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
113 } 110 }
111#endif
114 112
115 return IRQ_HANDLED; 113 return IRQ_HANDLED;
116} 114}
@@ -133,6 +131,7 @@ void __init hpet_time_init(void)
133 */ 131 */
134void __init time_init(void) 132void __init time_init(void)
135{ 133{
134 pre_time_init_hook();
136 tsc_init(); 135 tsc_init();
137 late_time_init = choose_time_init(); 136 late_time_init = choose_time_init();
138} 137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef7..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,30 +34,41 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF 35 of flags from PUSHF
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
56/* calibrate_cpu is used on systems with fixed rate TSCs to determine 68/* calibrate_cpu is used on systems with fixed rate TSCs to determine
57 * processor frequency */ 69 * processor frequency */
58#define TICK_COUNT 100000000 70#define TICK_COUNT 100000000
59unsigned long __init native_calculate_cpu_khz(void) 71unsigned long __init calibrate_cpu(void)
60{ 72{
61 int tsc_start, tsc_now; 73 int tsc_start, tsc_now;
62 int i, no_ctr_free; 74 int i, no_ctr_free;
@@ -100,7 +112,7 @@ unsigned long __init native_calculate_cpu_khz(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,28 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_calibrate(); 132 tsc_init();
120
121 cpu_khz = tsc_khz;
122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
124 cpu_khz = calculate_cpu_khz();
125
126 if (unsynchronized_tsc())
127 mark_tsc_unstable("TSCs unsynchronized");
128
129 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
130 vgetcpu_mode = VGETCPU_RDTSCP;
131 else
132 vgetcpu_mode = VGETCPU_LSL;
133 133
134 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
135 cpu_khz / 1000, cpu_khz % 1000);
136 init_tsc_clocksource();
137 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
138} 135}
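Both profile_pc() variants now read the profiled return address at regs->bp + sizeof(long) when CONFIG_FRAME_POINTER is set: in a frame-pointer-based frame the return address sits one word above the saved frame pointer. A freestanding userspace sketch of that layout, mirroring the kernel's struct stack_frame; build with gcc -O0 -fno-omit-frame-pointer (the function names here are illustrative only):

#include <stdio.h>

/* a frame-pointer-based stack frame: saved frame pointer first,
 * return address into the caller one word above it */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void __attribute__((noinline)) show_caller(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);

	/* the equivalent of *(unsigned long *)(bp + sizeof(long)) */
	printf("return address into main():   %#lx\n",
	       frame->return_address);
	printf("return address out of main(): %#lx\n",
	       frame->next_frame->return_address);
}

int main(void)
{
	show_caller();
	return 0;
}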
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 9bb2363851af..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -238,6 +238,14 @@ static void do_flush_tlb_all(void *info)
238 238
239void flush_tlb_all(void) 239void flush_tlb_all(void)
240{ 240{
241 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242}
243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
242} 250}
243 251
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index a1f07d793202..dcbf7a1159ea 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -15,6 +15,8 @@
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/idle.h> 17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
18 20
19#include <mach_ipi.h> 21#include <mach_ipi.h>
20/* 22/*
@@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
162 union smp_flush_state *f; 164 union smp_flush_state *f;
163 cpumask_t cpumask = *cpumaskp; 165 cpumask_t cpumask = *cpumaskp;
164 166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
165 /* Caller has disabled preemption */ 170 /* Caller has disabled preemption */
166 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
167 f = &per_cpu(flush_state, sender); 172 f = &per_cpu(flush_state, sender);
@@ -270,5 +275,5 @@ static void do_flush_tlb_all(void *info)
270 275
271void flush_tlb_all(void) 276void flush_tlb_all(void)
272{ 277{
273 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 278 on_each_cpu(do_flush_tlb_all, NULL, 1);
274} 279}
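Both flush_tlb_all() implementations switch to the three-argument on_each_cpu() used by this tree, dropping the old trailing 'retry' parameter. A minimal module sketch of the new calling convention, purely illustrative and not part of this commit:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void say_hello(void *info)
{
	pr_info("on_each_cpu demo: running on CPU %d\n", smp_processor_id());
}

static int __init on_each_cpu_demo_init(void)
{
	/* new form: function, argument, wait-for-all-CPUs flag */
	on_each_cpu(say_hello, NULL, 1);
	return 0;
}

static void __exit on_each_cpu_demo_exit(void)
{
}

module_init(on_each_cpu_demo_init);
module_exit(on_each_cpu_demo_exit);
MODULE_LICENSE("GPL");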
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
new file mode 100644
index 000000000000..8b8c0d6640fa
--- /dev/null
+++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,793 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/mc146818rtc.h>
10#include <linux/proc_fs.h>
11#include <linux/kernel.h>
12
13#include <asm/mmu_context.h>
14#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h>
17#include <asm/genapic.h>
18#include <asm/idle.h>
19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
21
22#include <mach_apic.h>
23
24static struct bau_control **uv_bau_table_bases __read_mostly;
25static int uv_bau_retry_limit __read_mostly;
26
27/* position of pnode (which is nasid>>1): */
28static int uv_nshift __read_mostly;
29
30static unsigned long uv_mmask __read_mostly;
31
32static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
33static DEFINE_PER_CPU(struct bau_control, bau_control);
34
35/*
36 * Free a software acknowledge hardware resource by clearing its Pending
37 * bit. This will return a reply to the sender.
38 * If the message has timed out, a reply has already been sent by the
39 * hardware but the resource has not been released. In that case our
40 * clear of the Timeout bit (as well) will free the resource. No reply will
41 * be sent (the hardware will only do one reply per message).
42 */
43static void uv_reply_to_message(int resource,
44 struct bau_payload_queue_entry *msg,
45 struct bau_msg_status *msp)
46{
47 unsigned long dw;
48
49 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
50 msg->replied_to = 1;
51 msg->sw_ack_vector = 0;
52 if (msp)
53 msp->seen_by.bits = 0;
54 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
55}
56
57/*
58 * Do all the things a cpu should do for a TLB shootdown message.
59 * Other cpu's may come here at the same time for this message.
60 */
61static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
62 int msg_slot, int sw_ack_slot)
63{
64 unsigned long this_cpu_mask;
65 struct bau_msg_status *msp;
66 int cpu;
67
68 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
69 cpu = uv_blade_processor_id();
70 msg->number_of_cpus =
71 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
72 this_cpu_mask = 1UL << cpu;
73 if (msp->seen_by.bits & this_cpu_mask)
74 return;
75 atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
76
77 if (msg->replied_to == 1)
78 return;
79
80 if (msg->address == TLB_FLUSH_ALL) {
81 local_flush_tlb();
82 __get_cpu_var(ptcstats).alltlb++;
83 } else {
84 __flush_tlb_one(msg->address);
85 __get_cpu_var(ptcstats).onetlb++;
86 }
87
88 __get_cpu_var(ptcstats).requestee++;
89
90 atomic_inc_short(&msg->acknowledge_count);
91 if (msg->number_of_cpus == msg->acknowledge_count)
92 uv_reply_to_message(sw_ack_slot, msg, msp);
93}
94
95/*
96 * Examine the payload queue on one distribution node to see
97 * which messages have not been seen, and which cpu(s) have not seen them.
98 *
99 * Returns the number of cpu's that have not responded.
100 */
101static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
102{
103 struct bau_payload_queue_entry *msg;
104 struct bau_msg_status *msp;
105 int count = 0;
106 int i;
107 int j;
108
109 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
110 msg++, i++) {
111 if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
112 msp = bau_tablesp->msg_statuses + i;
113 printk(KERN_DEBUG
114 "blade %d: address:%#lx %d of %d, not cpu(s): ",
115 i, msg->address, msg->acknowledge_count,
116 msg->number_of_cpus);
117 for (j = 0; j < msg->number_of_cpus; j++) {
118 if (!((1L << j) & msp->seen_by.bits)) {
119 count++;
120 printk("%d ", j);
121 }
122 }
123 printk("\n");
124 }
125 }
126 return count;
127}
128
129/*
130 * Examine the payload queue on all the distribution nodes to see
131 * which messages have not been seen, and which cpu(s) have not seen them.
132 *
133 * Returns the number of cpu's that have not responded.
134 */
135static int uv_examine_destinations(struct bau_target_nodemask *distribution)
136{
137 int sender;
138 int i;
139 int count = 0;
140
141 sender = smp_processor_id();
142 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
143 if (!bau_node_isset(i, distribution))
144 continue;
145 count += uv_examine_destination(uv_bau_table_bases[i], sender);
146 }
147 return count;
148}
149
150/*
151 * wait for completion of a broadcast message
152 *
153 * return COMPLETE, RETRY or GIVEUP
154 */
155static int uv_wait_completion(struct bau_desc *bau_desc,
156 unsigned long mmr_offset, int right_shift)
157{
158 int exams = 0;
159 long destination_timeouts = 0;
160 long source_timeouts = 0;
161 unsigned long descriptor_status;
162
163 while ((descriptor_status = (((unsigned long)
164 uv_read_local_mmr(mmr_offset) >>
165 right_shift) & UV_ACT_STATUS_MASK)) !=
166 DESC_STATUS_IDLE) {
167 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
168 source_timeouts++;
169 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
170 source_timeouts = 0;
171 __get_cpu_var(ptcstats).s_retry++;
172 return FLUSH_RETRY;
173 }
174 /*
175 * spin here looking for progress at the destinations
176 */
177 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
178 destination_timeouts++;
179 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
180 /*
181 * returns number of cpus not responding
182 */
183 if (uv_examine_destinations
184 (&bau_desc->distribution) == 0) {
185 __get_cpu_var(ptcstats).d_retry++;
186 return FLUSH_RETRY;
187 }
188 exams++;
189 if (exams >= uv_bau_retry_limit) {
190 printk(KERN_DEBUG
191 "uv_flush_tlb_others");
192 printk("giving up on cpu %d\n",
193 smp_processor_id());
194 return FLUSH_GIVEUP;
195 }
196 /*
197 * delays can hang the simulator
198 udelay(1000);
199 */
200 destination_timeouts = 0;
201 }
202 }
203 }
204 return FLUSH_COMPLETE;
205}
206
207/**
208 * uv_flush_send_and_wait
209 *
210 * Send a broadcast and wait for a broadcast message to complete.
211 *
212 * The cpumaskp mask contains the cpus the broadcast was sent to.
213 *
214 * Returns 1 if all remote flushing was done. The mask is zeroed.
215 * Returns 0 if some remote flushing remains to be done. The mask is left
216 * unchanged.
217 */
218int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
219 cpumask_t *cpumaskp)
220{
221 int completion_status = 0;
222 int right_shift;
223 int tries = 0;
224 int blade;
225 int bit;
226 unsigned long mmr_offset;
227 unsigned long index;
228 cycles_t time1;
229 cycles_t time2;
230
231 if (cpu < UV_CPUS_PER_ACT_STATUS) {
232 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
233 right_shift = cpu * UV_ACT_STATUS_SIZE;
234 } else {
235 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
236 right_shift =
237 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
238 }
239 time1 = get_cycles();
240 do {
241 tries++;
242 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
243 cpu;
244 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
245 completion_status = uv_wait_completion(bau_desc, mmr_offset,
246 right_shift);
247 } while (completion_status == FLUSH_RETRY);
248 time2 = get_cycles();
249 __get_cpu_var(ptcstats).sflush += (time2 - time1);
250 if (tries > 1)
251 __get_cpu_var(ptcstats).retriesok++;
252
253 if (completion_status == FLUSH_GIVEUP) {
254 /*
255 * Cause the caller to do an IPI-style TLB shootdown on
256 * the cpu's, all of which are still in the mask.
257 */
258 __get_cpu_var(ptcstats).ptc_i++;
259 return 0;
260 }
261
262 /*
263 * Success, so clear the remote cpu's from the mask so we don't
264 * use the IPI method of shootdown on them.
265 */
266 for_each_cpu_mask(bit, *cpumaskp) {
267 blade = uv_cpu_to_blade_id(bit);
268 if (blade == this_blade)
269 continue;
270 cpu_clear(bit, *cpumaskp);
271 }
272 if (!cpus_empty(*cpumaskp))
273 return 0;
274 return 1;
275}
276
277/**
278 * uv_flush_tlb_others - globally purge translation cache of a virtual
279 * address or all TLB's
280 * @cpumaskp: mask of all cpu's in which the address is to be removed
281 * @mm: mm_struct containing virtual address range
282 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
283 *
284 * This is the entry point for initiating any UV global TLB shootdown.
285 *
286 * Purges the translation caches of all specified processors of the given
287 * virtual address, or purges all TLB's on specified processors.
288 *
289 * The caller has derived the cpumaskp from the mm_struct and has subtracted
290 * the local cpu from the mask. This function is called only if there
291 * are bits set in the mask. (e.g. flush_tlb_page())
292 *
293 * The cpumaskp is converted into a nodemask of the nodes containing
294 * the cpus.
295 *
296 * Returns 1 if all remote flushing was done.
297 * Returns 0 if some remote flushing remains to be done.
298 */
299int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
300 unsigned long va)
301{
302 int i;
303 int bit;
304 int blade;
305 int cpu;
306 int this_blade;
307 int locals = 0;
308 struct bau_desc *bau_desc;
309
310 cpu = uv_blade_processor_id();
311 this_blade = uv_numa_blade_id();
312 bau_desc = __get_cpu_var(bau_control).descriptor_base;
313 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
314
315 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
316
317 i = 0;
318 for_each_cpu_mask(bit, *cpumaskp) {
319 blade = uv_cpu_to_blade_id(bit);
320 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
321 if (blade == this_blade) {
322 locals++;
323 continue;
324 }
325 bau_node_set(blade, &bau_desc->distribution);
326 i++;
327 }
328 if (i == 0) {
329 /*
330 * no off_node flushing; return status for local node
331 */
332 if (locals)
333 return 0;
334 else
335 return 1;
336 }
337 __get_cpu_var(ptcstats).requestor++;
338 __get_cpu_var(ptcstats).ntargeted += i;
339
340 bau_desc->payload.address = va;
341 bau_desc->payload.sending_cpu = smp_processor_id();
342
343 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
344}
345
346/*
347 * The BAU message interrupt comes here. (registered by set_intr_gate)
348 * See entry_64.S
349 *
350 * We received a broadcast assist message.
351 *
352 * Interrupts may have been disabled; this interrupt could represent
353 * the receipt of several messages.
354 *
355 * All cores/threads on this node get this interrupt.
356 * The last one to see it does the s/w ack.
 357 * (the resource will not be freed until noninterruptible cpus see this
358 * interrupt; hardware will timeout the s/w ack and reply ERROR)
359 */
360void uv_bau_message_interrupt(struct pt_regs *regs)
361{
362 struct bau_payload_queue_entry *va_queue_first;
363 struct bau_payload_queue_entry *va_queue_last;
364 struct bau_payload_queue_entry *msg;
365 struct pt_regs *old_regs = set_irq_regs(regs);
366 cycles_t time1;
367 cycles_t time2;
368 int msg_slot;
369 int sw_ack_slot;
370 int fw;
371 int count = 0;
372 unsigned long local_pnode;
373
374 ack_APIC_irq();
375 exit_idle();
376 irq_enter();
377
378 time1 = get_cycles();
379
380 local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
381
382 va_queue_first = __get_cpu_var(bau_control).va_queue_first;
383 va_queue_last = __get_cpu_var(bau_control).va_queue_last;
384
385 msg = __get_cpu_var(bau_control).bau_msg_head;
386 while (msg->sw_ack_vector) {
387 count++;
388 fw = msg->sw_ack_vector;
389 msg_slot = msg - va_queue_first;
390 sw_ack_slot = ffs(fw) - 1;
391
392 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
393
394 msg++;
395 if (msg > va_queue_last)
396 msg = va_queue_first;
397 __get_cpu_var(bau_control).bau_msg_head = msg;
398 }
399 if (!count)
400 __get_cpu_var(ptcstats).nomsg++;
401 else if (count > 1)
402 __get_cpu_var(ptcstats).multmsg++;
403
404 time2 = get_cycles();
405 __get_cpu_var(ptcstats).dflush += (time2 - time1);
406
407 irq_exit();
408 set_irq_regs(old_regs);
409}
410
411static void uv_enable_timeouts(void)
412{
413 int i;
414 int blade;
415 int last_blade;
416 int pnode;
417 int cur_cpu = 0;
418 unsigned long apicid;
419
420 last_blade = -1;
421 for_each_online_node(i) {
422 blade = uv_node_to_blade_id(i);
423 if (blade == last_blade)
424 continue;
425 last_blade = blade;
426 apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
427 pnode = uv_blade_to_pnode(blade);
428 cur_cpu += uv_blade_nr_possible_cpus(i);
429 }
430}
431
432static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
433{
434 if (*offset < num_possible_cpus())
435 return offset;
436 return NULL;
437}
438
439static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
440{
441 (*offset)++;
442 if (*offset < num_possible_cpus())
443 return offset;
444 return NULL;
445}
446
447static void uv_ptc_seq_stop(struct seq_file *file, void *data)
448{
449}
450
451/*
452 * Display the statistics thru /proc
453 * data points to the cpu number
454 */
455static int uv_ptc_seq_show(struct seq_file *file, void *data)
456{
457 struct ptc_stats *stat;
458 int cpu;
459
460 cpu = *(loff_t *)data;
461
462 if (!cpu) {
463 seq_printf(file,
464 "# cpu requestor requestee one all sretry dretry ptc_i ");
465 seq_printf(file,
466 "sw_ack sflush dflush sok dnomsg dmult starget\n");
467 }
468 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
469 stat = &per_cpu(ptcstats, cpu);
470 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
471 cpu, stat->requestor,
472 stat->requestee, stat->onetlb, stat->alltlb,
473 stat->s_retry, stat->d_retry, stat->ptc_i);
474 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
475 uv_read_global_mmr64(uv_blade_to_pnode
476 (uv_cpu_to_blade_id(cpu)),
477 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
478 stat->sflush, stat->dflush,
479 stat->retriesok, stat->nomsg,
480 stat->multmsg, stat->ntargeted);
481 }
482
483 return 0;
484}
485
486/*
487 * 0: display meaning of the statistics
488 * >0: retry limit
489 */
490static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
491 size_t count, loff_t *data)
492{
493 long newmode;
494 char optstr[64];
495
496 if (count == 0 || count > sizeof(optstr))
497 return -EINVAL;
498 if (copy_from_user(optstr, user, count))
499 return -EFAULT;
500 optstr[count - 1] = '\0';
501 if (strict_strtoul(optstr, 10, &newmode) < 0) {
502 printk(KERN_DEBUG "%s is invalid\n", optstr);
503 return -EINVAL;
504 }
505
506 if (newmode == 0) {
507 printk(KERN_DEBUG "# cpu: cpu number\n");
508 printk(KERN_DEBUG
509 "requestor: times this cpu was the flush requestor\n");
510 printk(KERN_DEBUG
511 "requestee: times this cpu was requested to flush its TLBs\n");
512 printk(KERN_DEBUG
513 "one: times requested to flush a single address\n");
514 printk(KERN_DEBUG
515 "all: times requested to flush all TLB's\n");
516 printk(KERN_DEBUG
517 "sretry: number of retries of source-side timeouts\n");
518 printk(KERN_DEBUG
519 "dretry: number of retries of destination-side timeouts\n");
520 printk(KERN_DEBUG
521 "ptc_i: times UV fell through to IPI-style flushes\n");
522 printk(KERN_DEBUG
523 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
524 printk(KERN_DEBUG
525 "sflush_us: cycles spent in uv_flush_tlb_others()\n");
526 printk(KERN_DEBUG
527 "dflush_us: cycles spent in handling flush requests\n");
528 printk(KERN_DEBUG "sok: successes on retry\n");
529 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
530 printk(KERN_DEBUG
531 "dmult: interrupts with multiple messages\n");
532 printk(KERN_DEBUG "starget: nodes targeted\n");
533 } else {
534 uv_bau_retry_limit = newmode;
535 printk(KERN_DEBUG "timeout retry limit:%d\n",
536 uv_bau_retry_limit);
537 }
538
539 return count;
540}
541
542static const struct seq_operations uv_ptc_seq_ops = {
543 .start = uv_ptc_seq_start,
544 .next = uv_ptc_seq_next,
545 .stop = uv_ptc_seq_stop,
546 .show = uv_ptc_seq_show
547};
548
549static int uv_ptc_proc_open(struct inode *inode, struct file *file)
550{
551 return seq_open(file, &uv_ptc_seq_ops);
552}
553
554static const struct file_operations proc_uv_ptc_operations = {
555 .open = uv_ptc_proc_open,
556 .read = seq_read,
557 .write = uv_ptc_proc_write,
558 .llseek = seq_lseek,
559 .release = seq_release,
560};
561
562static int __init uv_ptc_init(void)
563{
564 struct proc_dir_entry *proc_uv_ptc;
565
566 if (!is_uv_system())
567 return 0;
568
569 if (!proc_mkdir("sgi_uv", NULL))
570 return -EINVAL;
571
572 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
573 if (!proc_uv_ptc) {
574 printk(KERN_ERR "unable to create %s proc entry\n",
575 UV_PTC_BASENAME);
576 remove_proc_entry("sgi_uv", NULL);
577 return -EINVAL;
578 }
579 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
580 return 0;
581}
582
583/*
584 * begin the initialization of the per-blade control structures
585 */
586static struct bau_control * __init uv_table_bases_init(int blade, int node)
587{
588 int i;
589 int *ip;
590 struct bau_msg_status *msp;
591 struct bau_control *bau_tabp;
592
593 bau_tabp =
594 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
595 BUG_ON(!bau_tabp);
596
597 bau_tabp->msg_statuses =
598 kmalloc_node(sizeof(struct bau_msg_status) *
599 DEST_Q_SIZE, GFP_KERNEL, node);
600 BUG_ON(!bau_tabp->msg_statuses);
601
602 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
603 bau_cpubits_clear(&msp->seen_by, (int)
604 uv_blade_nr_possible_cpus(blade));
605
606 bau_tabp->watching =
607 kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
608 BUG_ON(!bau_tabp->watching);
609
610 for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
611 *ip = 0;
612
613 uv_bau_table_bases[blade] = bau_tabp;
614
615 return bau_tabp;
616}
617
618/*
619 * finish the initialization of the per-blade control structures
620 */
621static void __init
622uv_table_bases_finish(int blade, int node, int cur_cpu,
623 struct bau_control *bau_tablesp,
624 struct bau_desc *adp)
625{
626 struct bau_control *bcp;
627 int i;
628
629 for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
630 bcp = (struct bau_control *)&per_cpu(bau_control, i);
631
632 bcp->bau_msg_head = bau_tablesp->va_queue_first;
633 bcp->va_queue_first = bau_tablesp->va_queue_first;
634 bcp->va_queue_last = bau_tablesp->va_queue_last;
635 bcp->watching = bau_tablesp->watching;
636 bcp->msg_statuses = bau_tablesp->msg_statuses;
637 bcp->descriptor_base = adp;
638 }
639}
640
641/*
642 * initialize the sending side's sending buffers
643 */
644static struct bau_desc * __init
645uv_activation_descriptor_init(int node, int pnode)
646{
647 int i;
648 unsigned long pa;
649 unsigned long m;
650 unsigned long n;
651 unsigned long mmr_image;
652 struct bau_desc *adp;
653 struct bau_desc *ad2;
654
655 adp = (struct bau_desc *)
656 kmalloc_node(16384, GFP_KERNEL, node);
657 BUG_ON(!adp);
658
659 pa = __pa((unsigned long)adp);
660 n = pa >> uv_nshift;
661 m = pa & uv_mmask;
662
663 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
664 if (mmr_image) {
665 uv_write_global_mmr64(pnode, (unsigned long)
666 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
667 (n << UV_DESC_BASE_PNODE_SHIFT | m));
668 }
669
670 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
671 memset(ad2, 0, sizeof(struct bau_desc));
672 ad2->header.sw_ack_flag = 1;
673 ad2->header.base_dest_nodeid =
674 uv_blade_to_pnode(uv_cpu_to_blade_id(0));
675 ad2->header.command = UV_NET_ENDPOINT_INTD;
676 ad2->header.int_both = 1;
677 /*
678 * all others need to be set to zero:
679 * fairness chaining multilevel count replied_to
680 */
681 }
682 return adp;
683}
684
685/*
686 * initialize the destination side's receiving buffers
687 */
688static struct bau_payload_queue_entry * __init
689uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
690{
691 struct bau_payload_queue_entry *pqp;
692 char *cp;
693
694 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
695 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
696 GFP_KERNEL, node);
697 BUG_ON(!pqp);
698
699 cp = (char *)pqp + 31;
700 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
701 bau_tablesp->va_queue_first = pqp;
702 uv_write_global_mmr64(pnode,
703 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
704 ((unsigned long)pnode <<
705 UV_PAYLOADQ_PNODE_SHIFT) |
706 uv_physnodeaddr(pqp));
707 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
708 uv_physnodeaddr(pqp));
709 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
710 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
711 (unsigned long)
712 uv_physnodeaddr(bau_tablesp->va_queue_last));
713 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
714
715 return pqp;
716}
717
718/*
719 * Initialization of each UV blade's structures
720 */
721static int __init uv_init_blade(int blade, int node, int cur_cpu)
722{
723 int pnode;
724 unsigned long pa;
725 unsigned long apicid;
726 struct bau_desc *adp;
727 struct bau_payload_queue_entry *pqp;
728 struct bau_control *bau_tablesp;
729
730 bau_tablesp = uv_table_bases_init(blade, node);
731 pnode = uv_blade_to_pnode(blade);
732 adp = uv_activation_descriptor_init(node, pnode);
733 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
734 uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
735 /*
736 * the below initialization can't be in firmware because the
737 * messaging IRQ will be determined by the OS
738 */
739 apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
740 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
741 if ((pa & 0xff) != UV_BAU_MESSAGE) {
742 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
743 ((apicid << 32) | UV_BAU_MESSAGE));
744 }
745 return 0;
746}
747
748/*
749 * Initialization of BAU-related structures
750 */
751static int __init uv_bau_init(void)
752{
753 int blade;
754 int node;
755 int nblades;
756 int last_blade;
757 int cur_cpu = 0;
758
759 if (!is_uv_system())
760 return 0;
761
762 uv_bau_retry_limit = 1;
763 uv_nshift = uv_hub_info->n_val;
764 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
765 nblades = 0;
766 last_blade = -1;
767 for_each_online_node(node) {
768 blade = uv_node_to_blade_id(node);
769 if (blade == last_blade)
770 continue;
771 last_blade = blade;
772 nblades++;
773 }
774 uv_bau_table_bases = (struct bau_control **)
775 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
776 BUG_ON(!uv_bau_table_bases);
777
778 last_blade = -1;
779 for_each_online_node(node) {
780 blade = uv_node_to_blade_id(node);
781 if (blade == last_blade)
782 continue;
783 last_blade = blade;
784 uv_init_blade(blade, node, cur_cpu);
785 cur_cpu += uv_blade_nr_possible_cpus(blade);
786 }
787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
788 uv_enable_timeouts();
789
790 return 0;
791}
792__initcall(uv_bau_init);
793__initcall(uv_ptc_init);
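uv_ptc_init() above publishes the BAU shootdown statistics through procfs: reads go through the seq_file operations, a write of "0" makes the kernel printk the meaning of each column, and any larger value becomes the new retry limit. A userspace sketch of that interface, not part of the commit; the actual file name comes from UV_PTC_BASENAME in a header not shown here, so the /proc/sgi_uv/ptc_statistics path below is an assumption for illustration:

#include <stdio.h>

/* assumed expansion of UV_PTC_BASENAME (defined in uv_bau.h, not shown) */
#define PTC_STATS "/proc/sgi_uv/ptc_statistics"

int main(void)
{
	char line[512];
	FILE *f;

	/* writing "0" asks the kernel to printk the legend for the columns */
	f = fopen(PTC_STATS, "w");
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}

	/* reading yields a header plus one statistics line per online CPU */
	f = fopen(PTC_STATS, "r");
	if (!f) {
		perror(PTC_STATS);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}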
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
10#include <asm/ldt.h> 10#include <asm/ldt.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/syscalls.h>
13 14
14#include "tls.h" 15#include "tls.h"
15 16
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4 4
5/* ready for x86_64, no harm for x86, since it will overwrite after alloc */ 5/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 7
8/* 8/*
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 08d752de4eee..e062974cce34 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -1,18 +1,17 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
3 * 4 *
4 * Pentium III FXSR, SSE support 5 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 6 * Gareth Hughes <gareth@valinux.com>, May 2000
6 */ 7 */
7 8
8/* 9/*
9 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
10 * state in 'asm.s'.
11 */ 11 */
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/highmem.h>
16#include <linux/kprobes.h> 15#include <linux/kprobes.h>
17#include <linux/uaccess.h> 16#include <linux/uaccess.h>
18#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -31,6 +30,8 @@
31#include <linux/bug.h> 30#include <linux/bug.h>
32#include <linux/nmi.h> 31#include <linux/nmi.h>
33#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
34 35
35#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
36#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -45,22 +46,31 @@
45#include <linux/edac.h> 46#include <linux/edac.h>
46#endif 47#endif
47 48
48#include <asm/arch_hooks.h>
49#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
50#include <asm/processor.h> 50#include <asm/processor.h>
51#include <asm/debugreg.h> 51#include <asm/debugreg.h>
52#include <asm/atomic.h> 52#include <asm/atomic.h>
53#include <asm/system.h> 53#include <asm/system.h>
54#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
55#include <asm/desc.h> 56#include <asm/desc.h>
56#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
57#include <asm/nmi.h> 68#include <asm/nmi.h>
58#include <asm/smp.h> 69#include <asm/smp.h>
59#include <asm/io.h> 70#include <asm/io.h>
71#include <asm/traps.h>
60 72
61#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
62
63int panic_on_unrecovered_nmi;
64 74
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -77,437 +87,104 @@ char ignore_fpu_irq;
77 */ 87 */
78gate_desc idt_table[256] 88gate_desc idt_table[256]
79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80
81asmlinkage void divide_error(void);
82asmlinkage void debug(void);
83asmlinkage void nmi(void);
84asmlinkage void int3(void);
85asmlinkage void overflow(void);
86asmlinkage void bounds(void);
87asmlinkage void invalid_op(void);
88asmlinkage void device_not_available(void);
89asmlinkage void coprocessor_segment_overrun(void);
90asmlinkage void invalid_TSS(void);
91asmlinkage void segment_not_present(void);
92asmlinkage void stack_segment(void);
93asmlinkage void general_protection(void);
94asmlinkage void page_fault(void);
95asmlinkage void coprocessor_error(void);
96asmlinkage void simd_coprocessor_error(void);
97asmlinkage void alignment_check(void);
98asmlinkage void spurious_interrupt_bug(void);
99asmlinkage void machine_check(void);
100
101int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64;
103
104void printk_address(unsigned long address, int reliable)
105{
106#ifdef CONFIG_KALLSYMS
107 char namebuf[KSYM_NAME_LEN];
108 unsigned long offset = 0;
109 unsigned long symsize;
110 const char *symname;
111 char reliab[4] = "";
112 char *delim = ":";
113 char *modname;
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%08lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%08lx>]\n", address);
130#endif 90#endif
131}
132
133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
134{
135 return p > (void *)tinfo &&
136 p <= (void *)tinfo + THREAD_SIZE - size;
137}
138
139/* The form of the top of the frame on the stack */
140struct stack_frame {
141 struct stack_frame *next_frame;
142 unsigned long return_address;
143};
144
145static inline unsigned long
146print_context_stack(struct thread_info *tinfo,
147 unsigned long *stack, unsigned long bp,
148 const struct stacktrace_ops *ops, void *data)
149{
150 struct stack_frame *frame = (struct stack_frame *)bp;
151
152 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
153 unsigned long addr;
154
155 addr = *stack;
156 if (__kernel_text_address(addr)) {
157 if ((unsigned long) stack == bp + 4) {
158 ops->address(data, addr, 1);
159 frame = frame->next_frame;
160 bp = (unsigned long) frame;
161 } else {
162 ops->address(data, addr, bp == 0);
163 }
164 }
165 stack++;
166 }
167 return bp;
168}
169
170#define MSG(msg) ops->warning(data, msg)
171
172void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *stack, unsigned long bp,
174 const struct stacktrace_ops *ops, void *data)
175{
176 if (!task)
177 task = current;
178
179 if (!stack) {
180 unsigned long dummy;
181
182 stack = &dummy;
183 if (task != current)
184 stack = (unsigned long *)task->thread.sp;
185 }
186
187#ifdef CONFIG_FRAME_POINTER
188 if (!bp) {
189 if (task == current) {
190 /* Grab bp right from our regs */
191 asm("movl %%ebp, %0" : "=r" (bp) :);
192 } else {
193 /* bp is the last reg pushed by switch_to */
194 bp = *(unsigned long *) task->thread.sp;
195 }
196 }
197#endif
198
199 while (1) {
200 struct thread_info *context;
201
202 context = (struct thread_info *)
203 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
204 bp = print_context_stack(context, stack, bp, ops, data);
205 /*
206 * Should be after the line below, but somewhere
207 * in early boot context comes out corrupted and we
208 * can't reference it:
209 */
210 if (ops->stack(data, "IRQ") < 0)
211 break;
212 stack = (unsigned long *)context->previous_esp;
213 if (!stack)
214 break;
215 touch_nmi_watchdog();
216 }
217}
218EXPORT_SYMBOL(dump_trace);
219
220static void
221print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
222{
223 printk(data);
224 print_symbol(msg, symbol);
225 printk("\n");
226}
227
228static void print_trace_warning(void *data, char *msg)
229{
230 printk("%s%s\n", (char *)data, msg);
231}
232
233static int print_trace_stack(void *data, char *name)
234{
235 return 0;
236}
237 91
238/* 92static int ignore_nmis;
239 * Print one address/symbol entries per line.
240 */
241static void print_trace_address(void *data, unsigned long addr, int reliable)
242{
243 printk("%s [<%08lx>] ", (char *)data, addr);
244 if (!reliable)
245 printk("? ");
246 print_symbol("%s\n", addr);
247 touch_nmi_watchdog();
248}
249
250static const struct stacktrace_ops print_trace_ops = {
251 .warning = print_trace_warning,
252 .warning_symbol = print_trace_warning_symbol,
253 .stack = print_trace_stack,
254 .address = print_trace_address,
255};
256 93
257static void 94static inline void conditional_sti(struct pt_regs *regs)
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 95{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
262 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
263} 98}
264 99
265void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
266 unsigned long *stack, unsigned long bp)
267{ 101{
268 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
269} 105}
270 106
271static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
272show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
273 unsigned long *sp, unsigned long bp, char *log_lvl)
274{ 108{
275 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
276 int i; 110 local_irq_disable();
277 111 dec_preempt_count();
278 if (sp == NULL) {
279 if (task)
280 sp = (unsigned long *)task->thread.sp;
281 else
282 sp = (unsigned long *)&sp;
283 }
284
285 stack = sp;
286 for (i = 0; i < kstack_depth_to_print; i++) {
287 if (kstack_end(stack))
288 break;
289 if (i && ((i % 8) == 0))
290 printk("\n%s ", log_lvl);
291 printk("%08lx ", *stack++);
292 }
293 printk("\n%sCall Trace:\n", log_lvl);
294
295 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
296} 112}
297 113
298void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
299{ 117{
300 printk(" "); 118 if (!user_mode_vm(regs))
301 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
302} 120}
303 121
304/* 122/*
305 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
306 */ 127 */
307void dump_stack(void) 128static int lazy_iobitmap_copy(void)
308{ 129{
309 unsigned long bp = 0; 130 struct thread_struct *thread;
310 unsigned long stack; 131 struct tss_struct *tss;
311 132 int cpu;
312#ifdef CONFIG_FRAME_POINTER
313 if (!bp)
314 asm("movl %%ebp, %0" : "=r" (bp):);
315#endif
316
317 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
318 current->pid, current->comm, print_tainted(),
319 init_utsname()->release,
320 (int)strcspn(init_utsname()->version, " "),
321 init_utsname()->version);
322
323 show_trace(current, NULL, &stack, bp);
324}
325
326EXPORT_SYMBOL(dump_stack);
327
328void show_registers(struct pt_regs *regs)
329{
330 int i;
331
332 print_modules();
333 __show_registers(regs, 0);
334
335 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
336 TASK_COMM_LEN, current->comm, task_pid_nr(current),
337 current_thread_info(), current, task_thread_info(current));
338 /*
339 * When in-kernel, we also print out the stack and code at the
340 * time of the fault..
341 */
342 if (!user_mode_vm(regs)) {
343 unsigned int code_prologue = code_bytes * 43 / 64;
344 unsigned int code_len = code_bytes;
345 unsigned char c;
346 u8 *ip;
347
348 printk("\n" KERN_EMERG "Stack: ");
349 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
350
351 printk(KERN_EMERG "Code: ");
352
353 ip = (u8 *)regs->ip - code_prologue;
354 if (ip < (u8 *)PAGE_OFFSET ||
355 probe_kernel_address(ip, c)) {
356 /* try starting at EIP */
357 ip = (u8 *)regs->ip;
358 code_len = code_len - code_prologue + 1;
359 }
360 for (i = 0; i < code_len; i++, ip++) {
361 if (ip < (u8 *)PAGE_OFFSET ||
362 probe_kernel_address(ip, c)) {
363 printk(" Bad EIP value.");
364 break;
365 }
366 if (ip == (u8 *)regs->ip)
367 printk("<%02x> ", c);
368 else
369 printk("%02x ", c);
370 }
371 }
372 printk("\n");
373}
374
375int is_valid_bugaddr(unsigned long ip)
376{
377 unsigned short ud2;
378
379 if (ip < PAGE_OFFSET)
380 return 0;
381 if (probe_kernel_address((unsigned short *)ip, ud2))
382 return 0;
383
384 return ud2 == 0x0b0f;
385}
386
387static int die_counter;
388 133
389int __kprobes __die(const char *str, struct pt_regs *regs, long err) 134 cpu = get_cpu();
390{ 135 tss = &per_cpu(init_tss, cpu);
391 unsigned short ss; 136 thread = &current->thread;
392 unsigned long sp;
393 137
394 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
395#ifdef CONFIG_PREEMPT 139 thread->io_bitmap_ptr) {
396 printk("PREEMPT "); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
397#endif 141 thread->io_bitmap_max);
398#ifdef CONFIG_SMP 142 /*
399 printk("SMP "); 143 * If the previously set map was extending to higher ports
400#endif 144 * than the current one, pad extra space with 0xff (no access).
401#ifdef CONFIG_DEBUG_PAGEALLOC 145 */
402 printk("DEBUG_PAGEALLOC"); 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
403#endif 147 memset((char *) tss->io_bitmap +
404 printk("\n"); 148 thread->io_bitmap_max, 0xff,
405 149 tss->io_bitmap_max - thread->io_bitmap_max);
406 if (notify_die(DIE_OOPS, str, regs, err,
407 current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
408
409 show_registers(regs);
410 /* Executive summary in case the oops scrolled away */
411 sp = (unsigned long) (&regs->sp);
412 savesegment(ss, ss);
413 if (user_mode(regs)) {
414 sp = regs->sp;
415 ss = regs->ss & 0xffff;
416 } 150 }
417 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 151 tss->io_bitmap_max = thread->io_bitmap_max;
418 print_symbol("%s", regs->ip); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
419 printk(" SS:ESP %04x:%08lx\n", ss, sp); 153 tss->io_bitmap_owner = thread;
420 154 put_cpu();
421 return 0;
422 }
423
424 return 1;
425}
426
427/*
428 * This is gone through when something in the kernel has done something bad
429 * and is about to be terminated:
430 */
431void die(const char *str, struct pt_regs *regs, long err)
432{
433 static struct {
434 raw_spinlock_t lock;
435 u32 lock_owner;
436 int lock_owner_depth;
437 } die = {
438 .lock = __RAW_SPIN_LOCK_UNLOCKED,
439 .lock_owner = -1,
440 .lock_owner_depth = 0
441 };
442 unsigned long flags;
443
444 oops_enter();
445
446 if (die.lock_owner != raw_smp_processor_id()) {
447 console_verbose();
448 raw_local_irq_save(flags);
449 __raw_spin_lock(&die.lock);
450 die.lock_owner = smp_processor_id();
451 die.lock_owner_depth = 0;
452 bust_spinlocks(1);
453 } else {
454 raw_local_irq_save(flags);
455 }
456
457 if (++die.lock_owner_depth < 3) {
458 report_bug(regs->ip, regs);
459 155
460 if (__die(str, regs, err)) 156 return 1;
461 regs = NULL;
462 } else {
463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
464 } 157 }
158 put_cpu();
465 159
466 bust_spinlocks(0); 160 return 0;
467 die.lock_owner = -1;
468 add_taint(TAINT_DIE);
469 __raw_spin_unlock(&die.lock);
470 raw_local_irq_restore(flags);
471
472 if (!regs)
473 return;
474
475 if (kexec_should_crash(current))
476 crash_kexec(regs);
477
478 if (in_interrupt())
479 panic("Fatal exception in interrupt");
480
481 if (panic_on_oops)
482 panic("Fatal exception");
483
484 oops_exit();
485 do_exit(SIGSEGV);
486}
487
488static inline void
489die_if_kernel(const char *str, struct pt_regs *regs, long err)
490{
491 if (!user_mode_vm(regs))
492 die(str, regs, err);
493} 161}
162#endif
494 163
495static void __kprobes 164static void __kprobes
496do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
497 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
498{ 167{
499 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
500 169
170#ifdef CONFIG_X86_32
501 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
502 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
503 goto vm86_trap; 177 goto vm86_trap;
504 goto trap_signal; 178 goto trap_signal;
505 } 179 }
180#endif
506 181
507 if (!user_mode(regs)) 182 if (!user_mode(regs))
508 goto kernel_trap; 183 goto kernel_trap;
509 184
185#ifdef CONFIG_X86_32
510trap_signal: 186trap_signal:
187#endif
511 /* 188 /*
512 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
513 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -520,6 +197,18 @@ trap_signal:
520 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
521 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
522 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
523 if (info) 212 if (info)
524 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
525 else 214 else
@@ -534,152 +223,136 @@ kernel_trap:
534 } 223 }
535 return; 224 return;
536 225
226#ifdef CONFIG_X86_32
537vm86_trap: 227vm86_trap:
538 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
539 error_code, trapnr)) 229 error_code, trapnr))
540 goto trap_signal; 230 goto trap_signal;
541 return; 231 return;
232#endif
542} 233}
543 234
544#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
545void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
546{ \ 237{ \
547 trace_hardirqs_fixup(); \
548 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
549 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
550 return; \ 240 return; \
551 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
552} 243}
553 244
554#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
555void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
556{ \ 247{ \
557 siginfo_t info; \ 248 siginfo_t info; \
558 if (irq) \
559 local_irq_enable(); \
560 info.si_signo = signr; \ 249 info.si_signo = signr; \
561 info.si_errno = 0; \ 250 info.si_errno = 0; \
562 info.si_code = sicode; \ 251 info.si_code = sicode; \
563 info.si_addr = (void __user *)siaddr; \ 252 info.si_addr = (void __user *)siaddr; \
564 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
565 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
566 return; \ 255 return; \
567 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
568} 258}
569 259
570#define DO_VM86_ERROR(trapnr, signr, str, name) \ 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
571void do_##name(struct pt_regs *regs, long error_code) \ 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
572{ \ 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
573 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
574 == NOTIFY_STOP) \ 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
575 return; \ 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
576 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
577} 267#ifdef CONFIG_X86_32
268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
269#endif
270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
578 271
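(For orientation, the reworked DO_ERROR_INFO() macro above expands to a handler of the following shape; this is an illustrative expansion of the macro body shown in this hunk, applied to trap 0, not code copied verbatim from the tree.)

	/* Approximate expansion of
	 * DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) */
	dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code)
	{
		siginfo_t info;

		info.si_signo = SIGFPE;
		info.si_errno = 0;
		info.si_code = FPE_INTDIV;
		info.si_addr = (void __user *)regs->ip;
		if (notify_die(DIE_TRAP, "divide error", regs, error_code, 0, SIGFPE)
				== NOTIFY_STOP)
			return;
		conditional_sti(regs);
		do_trap(0, SIGFPE, "divide error", regs, error_code, &info);
	}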
579#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 272#ifdef CONFIG_X86_64
580void do_##name(struct pt_regs *regs, long error_code) \ 273/* Runs on IST stack */
581{ \ 274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
582 siginfo_t info; \ 275{
583 info.si_signo = signr; \ 276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
584 info.si_errno = 0; \ 277 12, SIGBUS) == NOTIFY_STOP)
585 info.si_code = sicode; \ 278 return;
586 info.si_addr = (void __user *)siaddr; \ 279 preempt_conditional_sti(regs);
587 trace_hardirqs_fixup(); \ 280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
588 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 281 preempt_conditional_cli(regs);
589 == NOTIFY_STOP) \
590 return; \
591 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
592} 282}
593 283
594DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
595#ifndef CONFIG_KPROBES 285{
596DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
289 /* Return not checked because double check cannot be ignored */
                                            289	/* Return not checked because double fault cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
291
292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
597#endif 300#endif
598DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
599DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
600DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
601DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
602DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
603DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
604DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
605DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
606DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
607 301
608void __kprobes do_general_protection(struct pt_regs *regs, long error_code) 302dotraplinkage void __kprobes
303do_general_protection(struct pt_regs *regs, long error_code)
609{ 304{
610 struct thread_struct *thread; 305 struct task_struct *tsk;
611 struct tss_struct *tss;
612 int cpu;
613 306
614 cpu = get_cpu(); 307 conditional_sti(regs);
615 tss = &per_cpu(init_tss, cpu);
616 thread = &current->thread;
617
618 /*
619 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
620 * invalid offset set (the LAZY one) and the faulting thread has
621 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
622 * and we set the offset field correctly. Then we let the CPU to
623 * restart the faulting instruction.
624 */
625 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
626 thread->io_bitmap_ptr) {
627 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
628 thread->io_bitmap_max);
629 /*
630 * If the previously set map was extending to higher ports
631 * than the current one, pad extra space with 0xff (no access).
632 */
633 if (thread->io_bitmap_max < tss->io_bitmap_max) {
634 memset((char *) tss->io_bitmap +
635 thread->io_bitmap_max, 0xff,
636 tss->io_bitmap_max - thread->io_bitmap_max);
637 }
638 tss->io_bitmap_max = thread->io_bitmap_max;
639 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
640 tss->io_bitmap_owner = thread;
641 put_cpu();
642 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
643 return; 312 return;
644 } 313 }
645 put_cpu();
646 314
647 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
648 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
649 318
319 tsk = current;
650 if (!user_mode(regs)) 320 if (!user_mode(regs))
651 goto gp_in_kernel; 321 goto gp_in_kernel;
652 322
653 current->thread.error_code = error_code; 323 tsk->thread.error_code = error_code;
654 current->thread.trap_no = 13; 324 tsk->thread.trap_no = 13;
655 325
656 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 326 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
657 printk_ratelimit()) { 327 printk_ratelimit()) {
658 printk(KERN_INFO 328 printk(KERN_INFO
659 "%s[%d] general protection ip:%lx sp:%lx error:%lx", 329 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
660 current->comm, task_pid_nr(current), 330 tsk->comm, task_pid_nr(tsk),
661 regs->ip, regs->sp, error_code); 331 regs->ip, regs->sp, error_code);
662 print_vma_addr(" in ", regs->ip); 332 print_vma_addr(" in ", regs->ip);
663 printk("\n"); 333 printk("\n");
664 } 334 }
665 335
666 force_sig(SIGSEGV, current); 336 force_sig(SIGSEGV, tsk);
667 return; 337 return;
668 338
339#ifdef CONFIG_X86_32
669gp_in_vm86: 340gp_in_vm86:
670 local_irq_enable(); 341 local_irq_enable();
671 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
672 return; 343 return;
344#endif
673 345
674gp_in_kernel: 346gp_in_kernel:
675 if (!fixup_exception(regs)) { 347 if (fixup_exception(regs))
676 current->thread.error_code = error_code; 348 return;
677 current->thread.trap_no = 13; 349
678 if (notify_die(DIE_GPF, "general protection fault", regs, 350 tsk->thread.error_code = error_code;
351 tsk->thread.trap_no = 13;
352 if (notify_die(DIE_GPF, "general protection fault", regs,
679 error_code, 13, SIGSEGV) == NOTIFY_STOP) 353 error_code, 13, SIGSEGV) == NOTIFY_STOP)
680 return; 354 return;
681 die("general protection fault", regs, error_code); 355 die("general protection fault", regs, error_code);
682 }
683} 356}
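(The gp_in_kernel path above leans on fixup_exception(), i.e. the __ex_table mechanism used by uaccess code. A minimal sketch of that lookup, assuming the classic absolute-address exception_table_entry layout of this era; the real implementation lives elsewhere in arch/x86.)

	/* Sketch of fixup_exception(): find a fixup for the faulting IP and
	 * resume there instead of dying. */
	int fixup_exception_sketch(struct pt_regs *regs)
	{
		const struct exception_table_entry *fixup;

		fixup = search_exception_tables(regs->ip);
		if (fixup) {
			regs->ip = fixup->fixup;	/* jump to the recovery stub */
			return 1;
		}
		return 0;				/* no entry: caller will die() */
	}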
684 357
685static notrace __kprobes void 358static notrace __kprobes void
@@ -705,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
705 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
706 379
707 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
708 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
709} 383}
710 384
711static notrace __kprobes void 385static notrace __kprobes void
@@ -731,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
731static notrace __kprobes void 405static notrace __kprobes void
732unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
733{ 407{
734 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
735 return; 410 return;
736#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
737 /* 412 /*
@@ -754,50 +429,20 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
754 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
755} 430}
756 431
757static DEFINE_SPINLOCK(nmi_print_lock);
758
759void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
760{
761 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
762 return;
763
764 spin_lock(&nmi_print_lock);
765 /*
766 * We are in trouble anyway, lets at least try
767 * to get a message out:
768 */
769 bust_spinlocks(1);
770 printk(KERN_EMERG "%s", msg);
771 printk(" on CPU%d, ip %08lx, registers:\n",
772 smp_processor_id(), regs->ip);
773 show_registers(regs);
774 console_silent();
775 spin_unlock(&nmi_print_lock);
776 bust_spinlocks(0);
777
778 /*
779 * If we are in kernel we are probably nested up pretty bad
780 * and might aswell get out now while we still can:
 780	 * and might as well get out now while we still can:
781 */
782 if (!user_mode_vm(regs)) {
783 current->thread.trap_no = 2;
784 crash_kexec(regs);
785 }
786
787 do_exit(SIGSEGV);
788}
789
790static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
791{ 433{
792 unsigned char reason = 0; 434 unsigned char reason = 0;
435 int cpu;
793 436
794 /* Only the BSP gets external NMIs from the system: */ 437 cpu = smp_processor_id();
795 if (!smp_processor_id()) 438
439 /* Only the BSP gets external NMIs from the system. */
440 if (!cpu)
796 reason = get_nmi_reason(); 441 reason = get_nmi_reason();
797 442
798 if (!(reason & 0xc0)) { 443 if (!(reason & 0xc0)) {
799 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 444 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
800 == NOTIFY_STOP) 445 == NOTIFY_STOP)
801 return; 446 return;
802#ifdef CONFIG_X86_LOCAL_APIC 447#ifdef CONFIG_X86_LOCAL_APIC
803 /* 448 /*
@@ -806,7 +451,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
806 */ 451 */
807 if (nmi_watchdog_tick(regs, reason)) 452 if (nmi_watchdog_tick(regs, reason))
808 return; 453 return;
809 if (!do_nmi_callback(regs, smp_processor_id())) 454 if (!do_nmi_callback(regs, cpu))
810 unknown_nmi_error(reason, regs); 455 unknown_nmi_error(reason, regs);
811#else 456#else
812 unknown_nmi_error(reason, regs); 457 unknown_nmi_error(reason, regs);
@@ -816,28 +461,31 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
816 } 461 }
817 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 462 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
818 return; 463 return;
464
465 /* AK: following checks seem to be broken on modern chipsets. FIXME */
819 if (reason & 0x80) 466 if (reason & 0x80)
820 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
821 if (reason & 0x40) 468 if (reason & 0x40)
822 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
823 /* 471 /*
824 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
825 * as it's edge-triggered: 473 * as it's edge-triggered:
826 */ 474 */
827 reassert_nmi(); 475 reassert_nmi();
476#endif
828} 477}
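(The reason byte tested in default_do_nmi() is System Control Port B, I/O port 0x61, which is what get_nmi_reason() reads on the default PC platform. A short sketch of the bits involved, matching the masks used above: 0x80/0x40 to detect, 0x04/0x08 written back to clear and disable. The constant names here are descriptive, not defined in this file.)

	#define NMI_SERR	0x80	/* bit 7: memory parity / PCI SERR# asserted */
	#define NMI_IOCHK	0x40	/* bit 6: I/O channel check (IOCHK) asserted */
	#define NMI_CLR_SERR	0x04	/* write 1: clear and disable the parity line */
	#define NMI_CLR_IOCHK	0x08	/* write 1: clear and disable the IOCHK line */

	static inline unsigned char nmi_reason_sketch(void)
	{
		return inb(0x61);	/* the reason byte checked against 0xc0 above */
	}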
829 478
830static int ignore_nmis; 479dotraplinkage notrace __kprobes void
831 480do_nmi(struct pt_regs *regs, long error_code)
832notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
833{ 481{
834 int cpu;
835
836 nmi_enter(); 482 nmi_enter();
837 483
838 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
839 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
840 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
841 489
842 if (!ignore_nmis) 490 if (!ignore_nmis)
843 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -857,21 +505,44 @@ void restart_nmi(void)
857 acpi_nmi_enable(); 505 acpi_nmi_enable();
858} 506}
859 507
860#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
861void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
862{ 510{
863 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
864
865 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
866 == NOTIFY_STOP) 513 == NOTIFY_STOP)
867 return; 514 return;
868 /* 515#else
869 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
870 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
871 */ 518 return;
872 restore_interrupts(regs); 519#endif
873 520
874 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
525
526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
533 /* Did already sync */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
875} 546}
876#endif 547#endif
877 548
@@ -896,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
896 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
897 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
898 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
899 */ 572 */
900void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
901{ 574{
902 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
903 unsigned int condition; 576 unsigned long condition;
904 577 int si_code;
905 trace_hardirqs_fixup();
906 578
907 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
908 580
@@ -913,11 +585,11 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
913 tsk->thread.debugctlmsr = 0; 585 tsk->thread.debugctlmsr = 0;
914 586
915 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
916 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
917 return; 589 return;
590
918 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
919 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
920 local_irq_enable();
921 593
922 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
923 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -925,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
925 goto clear_dr7; 597 goto clear_dr7;
926 } 598 }
927 599
600#ifdef CONFIG_X86_32
928 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
929 goto debug_vm86; 602 goto debug_vm86;
603#endif
930 604
931 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
932 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -936,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
936 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
937 */ 611 */
938 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
939 /*
940 * We already checked v86 mode above, so we can
941 * check for kernel mode by just checking the CPL
942 * of CS.
943 */
944 if (!user_mode(regs)) 613 if (!user_mode(regs))
945 goto clear_TF_reenable; 614 goto clear_TF_reenable;
946 } 615 }
947 616
617 si_code = get_si_code(condition);
948 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
949 send_sigtrap(tsk, regs, error_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
950 620
951 /* 621 /*
952 * Disable additional traps. They'll be re-enabled when 622 * Disable additional traps. They'll be re-enabled when
@@ -954,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
954 */ 624 */
955clear_dr7: 625clear_dr7:
956 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
957 return; 628 return;
958 629
630#ifdef CONFIG_X86_32
959debug_vm86: 631debug_vm86:
960 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
961 return; 634 return;
635#endif
962 636
963clear_TF_reenable: 637clear_TF_reenable:
964 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
965 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
966 return; 641 return;
967} 642}
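(do_debug() above filters DR6 with DR_TRAP0..DR_TRAP3 and DR_STEP; for orientation, a tiny decoder sketch using the asm/debugreg.h constants, where bits 0-3 are the per-breakpoint hits and bit 14 is the single-step trap. Illustrative only.)

	static const char *dr6_reason_sketch(unsigned long dr6)
	{
		if (dr6 & DR_STEP)
			return "single-step (TF)";
		if (dr6 & (DR_TRAP0 | DR_TRAP1 | DR_TRAP2 | DR_TRAP3))
			return "hardware breakpoint 0-3";
		return "other (spurious or lazy DR7)";
	}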
968 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
969/* 658/*
970 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
971 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -974,9 +663,8 @@ clear_TF_reenable:
974void math_error(void __user *ip) 663void math_error(void __user *ip)
975{ 664{
976 struct task_struct *task; 665 struct task_struct *task;
977 unsigned short cwd;
978 unsigned short swd;
979 siginfo_t info; 666 siginfo_t info;
667 unsigned short cwd, swd;
980 668
981 /* 669 /*
982 * Save the info for the exception handler and clear the error. 670 * Save the info for the exception handler and clear the error.
@@ -995,7 +683,7 @@ void math_error(void __user *ip)
995 * C1 reg you need in case of a stack fault, 0x040 is the stack 683 * C1 reg you need in case of a stack fault, 0x040 is the stack
996 * fault bit. We should only be taking one exception at a time, 684 * fault bit. We should only be taking one exception at a time,
997 * so if this combination doesn't produce any single exception, 685 * so if this combination doesn't produce any single exception,
998 * then we have a bad program that isn't syncronizing its FPU usage 686 * then we have a bad program that isn't synchronizing its FPU usage
999 * and it will suffer the consequences since we won't be able to 687 * and it will suffer the consequences since we won't be able to
1000 * fully reproduce the context of the exception 688 * fully reproduce the context of the exception
1001 */ 689 */
@@ -1003,8 +691,10 @@ void math_error(void __user *ip)
1003 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
1004 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
1005 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
1006 return; 695 return;
1007 default: /* Multiple exceptions */ 696#endif
697 default: /* Multiple exceptions */
1008 break; 698 break;
1009 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
1010 /* 700 /*
@@ -1031,17 +721,26 @@ void math_error(void __user *ip)
1031 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1032} 722}
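(To make the (swd & ~cwd & 0x3f) masking above concrete, a worked example; it assumes the conventional x87 default control word of 0x037f, which is not stated in this file. A task that unmasks only divide-by-zero runs with cwd = 0x037b, and after an x87 1.0/0.0 the status word has ZE (0x0004) set, so

	swd & ~cwd & 0x3f  ==  swd & 0x0004  ==  0x0004

which lands in case 0x004 above and sets info.si_code = FPE_FLTDIV.)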
1033 723
1034void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1035{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1036 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1037 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1038} 737}
1039 738
1040static void simd_math_error(void __user *ip) 739static void simd_math_error(void __user *ip)
1041{ 740{
1042 struct task_struct *task; 741 struct task_struct *task;
1043 unsigned short mxcsr;
1044 siginfo_t info; 742 siginfo_t info;
743 unsigned short mxcsr;
1045 744
1046 /* 745 /*
1047 * Save the info for the exception handler and clear the error. 746 * Save the info for the exception handler and clear the error.
@@ -1085,8 +784,12 @@ static void simd_math_error(void __user *ip)
1085 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1086} 785}
1087 786
1088void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1089{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1090 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1091 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1092 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1105,19 +808,28 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1105 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1106 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1107 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1108} 817}
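(As a user-space illustration of how a process reaches these two handlers: unmask an FP exception via glibc's fenv interface and execute the faulting operation. Build with -lm; whether it arrives as #MF via do_coprocessor_error() or #XF via do_simd_coprocessor_error() depends on whether the compiler used x87 or SSE math. This is a hedged example, not part of the patch.)

	#define _GNU_SOURCE
	#include <fenv.h>
	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void on_fpe(int sig, siginfo_t *si, void *ctx)
	{
		(void)sig; (void)ctx;
		/* si_code is FPE_FLTDIV, exactly as filled in by math_error()
		 * or simd_math_error() above (printf/exit in a handler is not
		 * async-signal-safe, but fine for a demo). */
		printf("SIGFPE, si_code=%d\n", si->si_code);
		exit(0);
	}

	int main(void)
	{
		struct sigaction sa = { .sa_sigaction = on_fpe, .sa_flags = SA_SIGINFO };
		volatile double zero = 0.0, x;

		sigaction(SIGFPE, &sa, NULL);
		feenableexcept(FE_DIVBYZERO);	/* clear the ZM mask bit in CW/MXCSR */
		x = 1.0 / zero;			/* traps; the kernel delivers SIGFPE */
		(void)x;
		return 1;
	}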
1109 818
1110void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1111{ 821{
822 conditional_sti(regs);
1112#if 0 823#if 0
1113 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1114 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1115#endif 826#endif
1116} 827}
1117 828
829#ifdef CONFIG_X86_32
1118unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1119{ 831{
1120 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
1121 unsigned long base = (kesp - uesp) & -THREAD_SIZE; 833 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1122 unsigned long new_kesp = kesp - base; 834 unsigned long new_kesp = kesp - base;
1123 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; 835 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
@@ -1133,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1133 845
1134 return new_kesp; 846 return new_kesp;
1135} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1136 857
1137/* 858/*
1138 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1165,14 +886,24 @@ asmlinkage void math_state_restore(void)
1165 } 886 }
1166 887
1167 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1168 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
893 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1169 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1170 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1171} 903}
1172EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1173 905
1174#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1175
1176asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1177{ 908{
1178 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1181,12 +912,54 @@ asmlinkage void math_emulate(long arg)
1181 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1182 schedule(); 913 schedule();
1183} 914}
1184
1185#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1186 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
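(Condensing the #NM wiring above into one place, since the two branches are easy to misread: CR0.EM set means there is no FPU and the instruction goes to the emulator; otherwise CR0.TS, set when the task was switched in (outside this diff), caused the trap and math_state_restore() reloads the state. A rough single-function sketch using only names visible in this file, 32-bit flavour.)

	dotraplinkage void device_not_available_sketch(struct pt_regs *regs)
	{
		if (read_cr0() & X86_CR0_EM) {
			/* no FPU at all: emulate the instruction */
			math_emulate(0);
			return;
		}
		/* FPU present; the trap came from CR0.TS being set */
		clts();					/* allow FP ops again (or we recurse) */
		restore_fpu(current);			/* reload this task's saved context */
		task_thread_info(current)->status |= TS_USEDFPU;
		current->fpu_counter++;
	}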
932
933#ifdef CONFIG_X86_32
934#ifdef CONFIG_X86_MCE
935dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error)
936{
937 conditional_sti(regs);
938 machine_check_vector(regs, error);
939}
940#endif
941
942dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
943{
944 siginfo_t info;
945 local_irq_enable();
946
947 info.si_signo = SIGILL;
948 info.si_errno = 0;
949 info.si_code = ILL_BADSTK;
950 info.si_addr = 0;
951 if (notify_die(DIE_TRAP, "iret exception",
952 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
953 return;
954 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
955}
956#endif
957
1187void __init trap_init(void) 958void __init trap_init(void)
1188{ 959{
960#ifdef CONFIG_X86_32
1189 int i; 961 int i;
962#endif
1190 963
1191#ifdef CONFIG_EISA 964#ifdef CONFIG_EISA
1192 void __iomem *p = early_ioremap(0x0FFFD9, 4); 965 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1196,32 +969,40 @@ void __init trap_init(void)
1196 early_iounmap(p, 4); 969 early_iounmap(p, 4);
1197#endif 970#endif
1198 971
1199#ifdef CONFIG_X86_LOCAL_APIC 972 set_intr_gate(0, &divide_error);
1200 init_apic_mappings(); 973 set_intr_gate_ist(1, &debug, DEBUG_STACK);
974 set_intr_gate_ist(2, &nmi, NMI_STACK);
975 /* int3 can be called from all */
976 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
977 /* int4 can be called from all */
978 set_system_intr_gate(4, &overflow);
979 set_intr_gate(5, &bounds);
980 set_intr_gate(6, &invalid_op);
981 set_intr_gate(7, &device_not_available);
982#ifdef CONFIG_X86_32
983 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
984#else
985 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1201#endif 986#endif
1202 set_trap_gate(0, &divide_error); 987 set_intr_gate(9, &coprocessor_segment_overrun);
1203 set_intr_gate(1, &debug); 988 set_intr_gate(10, &invalid_TSS);
1204 set_intr_gate(2, &nmi); 989 set_intr_gate(11, &segment_not_present);
1205 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ 990 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1206 set_system_gate(4, &overflow); 991 set_intr_gate(13, &general_protection);
1207 set_trap_gate(5, &bounds);
1208 set_trap_gate(6, &invalid_op);
1209 set_trap_gate(7, &device_not_available);
1210 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1211 set_trap_gate(9, &coprocessor_segment_overrun);
1212 set_trap_gate(10, &invalid_TSS);
1213 set_trap_gate(11, &segment_not_present);
1214 set_trap_gate(12, &stack_segment);
1215 set_trap_gate(13, &general_protection);
1216 set_intr_gate(14, &page_fault); 992 set_intr_gate(14, &page_fault);
1217 set_trap_gate(15, &spurious_interrupt_bug); 993 set_intr_gate(15, &spurious_interrupt_bug);
1218 set_trap_gate(16, &coprocessor_error); 994 set_intr_gate(16, &coprocessor_error);
1219 set_trap_gate(17, &alignment_check); 995 set_intr_gate(17, &alignment_check);
1220#ifdef CONFIG_X86_MCE 996#ifdef CONFIG_X86_MCE
1221 set_trap_gate(18, &machine_check); 997 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1222#endif 998#endif
1223 set_trap_gate(19, &simd_coprocessor_error); 999 set_intr_gate(19, &simd_coprocessor_error);
1224 1000
1001#ifdef CONFIG_IA32_EMULATION
1002 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1003#endif
1004
1005#ifdef CONFIG_X86_32
1225 if (cpu_has_fxsr) { 1006 if (cpu_has_fxsr) {
1226 printk(KERN_INFO "Enabling fast FPU save and restore... "); 1007 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1227 set_in_cr4(X86_CR4_OSFXSR); 1008 set_in_cr4(X86_CR4_OSFXSR);
@@ -1234,37 +1015,20 @@ void __init trap_init(void)
1234 printk("done.\n"); 1015 printk("done.\n");
1235 } 1016 }
1236 1017
1237 set_system_gate(SYSCALL_VECTOR, &system_call); 1018 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1238 1019
1239 /* Reserve all the builtin and the syscall vector: */ 1020 /* Reserve all the builtin and the syscall vector: */
1240 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1021 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1241 set_bit(i, used_vectors); 1022 set_bit(i, used_vectors);
1242 1023
1243 set_bit(SYSCALL_VECTOR, used_vectors); 1024 set_bit(SYSCALL_VECTOR, used_vectors);
1244 1025#endif
1245 init_thread_xstate();
1246 /* 1026 /*
1247 * Should be a barrier for any external CPU state: 1027 * Should be a barrier for any external CPU state:
1248 */ 1028 */
1249 cpu_init(); 1029 cpu_init();
1250 1030
1031#ifdef CONFIG_X86_32
1251 trap_init_hook(); 1032 trap_init_hook();
1033#endif
1252} 1034}
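(On the gate helpers used in trap_init() above: set_intr_gate() installs a DPL-0 interrupt gate with interrupts off on entry, the _ist variants additionally select an IST stack (NMI_STACK, DEBUG_STACK, ...), set_system_intr_gate() raises the DPL to 3 so "int $n" from user space is permitted, and set_system_trap_gate() is the DPL-3 trap-gate flavour used for SYSCALL_VECTOR. A sketch of what one of them amounts to, assuming the pack_gate()/write_idt_entry() helpers of this era; the real code lives in asm/desc.h and differs in detail between 32- and 64-bit.)

	static void set_system_intr_gate_sketch(unsigned int n, void *addr)
	{
		gate_desc g;

		pack_gate(&g, GATE_INTERRUPT, (unsigned long)addr,
			  0x3,		/* DPL 3: reachable via "int $n" from user mode */
			  0,		/* IST 0: stay on the current kernel stack */
			  __KERNEL_CS);
		write_idt_entry(idt_table, n, &g);
	}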
1253
1254static int __init kstack_setup(char *s)
1255{
1256 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1257
1258 return 1;
1259}
1260__setup("kstack=", kstack_setup);
1261
1262static int __init code_bytes_setup(char *s)
1263{
1264 code_bytes = simple_strtoul(s, NULL, 0);
1265 if (code_bytes > 8192)
1266 code_bytes = 8192;
1267
1268 return 1;
1269}
1270__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index adff76ea97c4..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1218 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/sched.h>
14#include <linux/kernel.h>
15#include <linux/string.h>
16#include <linux/errno.h>
17#include <linux/ptrace.h>
18#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/kallsyms.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/nmi.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/bug.h>
33#include <linux/kdebug.h>
34#include <linux/utsname.h>
35
36#include <mach_traps.h>
37
38#if defined(CONFIG_EDAC)
39#include <linux/edac.h>
40#endif
41
42#include <asm/system.h>
43#include <asm/io.h>
44#include <asm/atomic.h>
45#include <asm/debugreg.h>
46#include <asm/desc.h>
47#include <asm/i387.h>
48#include <asm/processor.h>
49#include <asm/unwind.h>
50#include <asm/smp.h>
51#include <asm/pgalloc.h>
52#include <asm/pda.h>
53#include <asm/proto.h>
54#include <asm/nmi.h>
55#include <asm/stacktrace.h>
56
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void reserved(void);
75asmlinkage void alignment_check(void);
76asmlinkage void machine_check(void);
77asmlinkage void spurious_interrupt_bug(void);
78
79static unsigned int code_bytes = 64;
80
81static inline void conditional_sti(struct pt_regs *regs)
82{
83 if (regs->flags & X86_EFLAGS_IF)
84 local_irq_enable();
85}
86
87static inline void preempt_conditional_sti(struct pt_regs *regs)
88{
89 inc_preempt_count();
90 if (regs->flags & X86_EFLAGS_IF)
91 local_irq_enable();
92}
93
94static inline void preempt_conditional_cli(struct pt_regs *regs)
95{
96 if (regs->flags & X86_EFLAGS_IF)
97 local_irq_disable();
98 /* Make sure to not schedule here because we could be running
99 on an exception stack. */
100 dec_preempt_count();
101}
102
103int kstack_depth_to_print = 12;
104
105void printk_address(unsigned long address, int reliable)
106{
107#ifdef CONFIG_KALLSYMS
108 unsigned long offset = 0, symsize;
109 const char *symname;
110 char *modname;
111 char *delim = ":";
112 char namebuf[KSYM_NAME_LEN];
113 char reliab[4] = "";
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%016lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%016lx>]\n", address);
130#endif
131}
132
133static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
134 unsigned *usedp, char **idp)
135{
136 static char ids[][8] = {
137 [DEBUG_STACK - 1] = "#DB",
138 [NMI_STACK - 1] = "NMI",
139 [DOUBLEFAULT_STACK - 1] = "#DF",
140 [STACKFAULT_STACK - 1] = "#SS",
141 [MCE_STACK - 1] = "#MC",
142#if DEBUG_STKSZ > EXCEPTION_STKSZ
143 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
144#endif
145 };
146 unsigned k;
147
148 /*
149 * Iterate over all exception stacks, and figure out whether
150 * 'stack' is in one of them:
151 */
152 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
153 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
154 /*
155 * Is 'stack' above this exception frame's end?
156 * If yes then skip to the next frame.
157 */
158 if (stack >= end)
159 continue;
160 /*
161 * Is 'stack' above this exception frame's start address?
162 * If yes then we found the right frame.
163 */
164 if (stack >= end - EXCEPTION_STKSZ) {
165 /*
166 * Make sure we only iterate through an exception
167 * stack once. If it comes up for the second time
168 * then there's something wrong going on - just
169 * break out and return NULL:
170 */
171 if (*usedp & (1U << k))
172 break;
173 *usedp |= 1U << k;
174 *idp = ids[k];
175 return (unsigned long *)end;
176 }
177 /*
178 * If this is a debug stack, and if it has a larger size than
179 * the usual exception stacks, then 'stack' might still
180 * be within the lower portion of the debug stack:
181 */
182#if DEBUG_STKSZ > EXCEPTION_STKSZ
183 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
184 unsigned j = N_EXCEPTION_STACKS - 1;
185
186 /*
187 * Black magic. A large debug stack is composed of
188 * multiple exception stack entries, which we
189 * iterate through now. Dont look:
190 */
191 do {
192 ++j;
193 end -= EXCEPTION_STKSZ;
194 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
195 } while (stack < end - EXCEPTION_STKSZ);
196 if (*usedp & (1U << j))
197 break;
198 *usedp |= 1U << j;
199 *idp = ids[j];
200 return (unsigned long *)end;
201 }
202#endif
203 }
204 return NULL;
205}
206
207#define MSG(txt) ops->warning(data, txt)
208
209/*
210 * x86-64 can have up to three kernel stacks:
211 * process stack
212 * interrupt stack
213 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
214 */
215
216static inline int valid_stack_ptr(struct thread_info *tinfo,
217 void *p, unsigned int size, void *end)
218{
219 void *t = tinfo;
220 if (end) {
221 if (p < end && p >= (end-THREAD_SIZE))
222 return 1;
223 else
224 return 0;
225 }
226 return p > t && p < t + THREAD_SIZE - size;
227}
228
229/* The form of the top of the frame on the stack */
230struct stack_frame {
231 struct stack_frame *next_frame;
232 unsigned long return_address;
233};
234
235
236static inline unsigned long print_context_stack(struct thread_info *tinfo,
237 unsigned long *stack, unsigned long bp,
238 const struct stacktrace_ops *ops, void *data,
239 unsigned long *end)
240{
241 struct stack_frame *frame = (struct stack_frame *)bp;
242
243 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
244 unsigned long addr;
245
246 addr = *stack;
247 if (__kernel_text_address(addr)) {
248 if ((unsigned long) stack == bp + 8) {
249 ops->address(data, addr, 1);
250 frame = frame->next_frame;
251 bp = (unsigned long) frame;
252 } else {
253 ops->address(data, addr, bp == 0);
254 }
255 }
256 stack++;
257 }
258 return bp;
259}
260
261void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
262 unsigned long *stack, unsigned long bp,
263 const struct stacktrace_ops *ops, void *data)
264{
265 const unsigned cpu = get_cpu();
266 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
267 unsigned used = 0;
268 struct thread_info *tinfo;
269
270 if (!tsk)
271 tsk = current;
272 tinfo = task_thread_info(tsk);
273
274 if (!stack) {
275 unsigned long dummy;
276 stack = &dummy;
277 if (tsk && tsk != current)
278 stack = (unsigned long *)tsk->thread.sp;
279 }
280
281#ifdef CONFIG_FRAME_POINTER
282 if (!bp) {
283 if (tsk == current) {
284 /* Grab bp right from our regs */
285 asm("movq %%rbp, %0" : "=r" (bp):);
286 } else {
287 /* bp is the last reg pushed by switch_to */
288 bp = *(unsigned long *) tsk->thread.sp;
289 }
290 }
291#endif
292
293
294
295 /*
296 * Print function call entries in all stacks, starting at the
297 * current stack address. If the stacks consist of nested
298 * exceptions
299 */
300 for (;;) {
301 char *id;
302 unsigned long *estack_end;
303 estack_end = in_exception_stack(cpu, (unsigned long)stack,
304 &used, &id);
305
306 if (estack_end) {
307 if (ops->stack(data, id) < 0)
308 break;
309
310 bp = print_context_stack(tinfo, stack, bp, ops,
311 data, estack_end);
312 ops->stack(data, "<EOE>");
313 /*
314 * We link to the next stack via the
315 * second-to-last pointer (index -2 to end) in the
316 * exception stack:
317 */
318 stack = (unsigned long *) estack_end[-2];
319 continue;
320 }
321 if (irqstack_end) {
322 unsigned long *irqstack;
323 irqstack = irqstack_end -
324 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
325
326 if (stack >= irqstack && stack < irqstack_end) {
327 if (ops->stack(data, "IRQ") < 0)
328 break;
329 bp = print_context_stack(tinfo, stack, bp,
330 ops, data, irqstack_end);
331 /*
332 * We link to the next stack (which would be
333 * the process stack normally) the last
334 * pointer (index -1 to end) in the IRQ stack:
335 */
336 stack = (unsigned long *) (irqstack_end[-1]);
337 irqstack_end = NULL;
338 ops->stack(data, "EOI");
339 continue;
340 }
341 }
342 break;
343 }
344
345 /*
346 * This handles the process stack:
347 */
348 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
349 put_cpu();
350}
351EXPORT_SYMBOL(dump_trace);
352
353static void
354print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
355{
356 print_symbol(msg, symbol);
357 printk("\n");
358}
359
360static void print_trace_warning(void *data, char *msg)
361{
362 printk("%s\n", msg);
363}
364
365static int print_trace_stack(void *data, char *name)
366{
367 printk(" <%s> ", name);
368 return 0;
369}
370
371static void print_trace_address(void *data, unsigned long addr, int reliable)
372{
373 touch_nmi_watchdog();
374 printk_address(addr, reliable);
375}
376
377static const struct stacktrace_ops print_trace_ops = {
378 .warning = print_trace_warning,
379 .warning_symbol = print_trace_warning_symbol,
380 .stack = print_trace_stack,
381 .address = print_trace_address,
382};
383
384void
385show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
386 unsigned long bp)
387{
388 printk("\nCall Trace:\n");
389 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
390 printk("\n");
391}
392
393static void
394_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
395 unsigned long bp)
396{
397 unsigned long *stack;
398 int i;
399 const int cpu = smp_processor_id();
400 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
401 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
402
403 // debugging aid: "show_stack(NULL, NULL);" prints the
404 // back trace for this cpu.
405
406 if (sp == NULL) {
407 if (tsk)
408 sp = (unsigned long *)tsk->thread.sp;
409 else
410 sp = (unsigned long *)&sp;
411 }
412
413 stack = sp;
414 for(i=0; i < kstack_depth_to_print; i++) {
415 if (stack >= irqstack && stack <= irqstack_end) {
416 if (stack == irqstack_end) {
417 stack = (unsigned long *) (irqstack_end[-1]);
418 printk(" <EOI> ");
419 }
420 } else {
421 if (((long) stack & (THREAD_SIZE-1)) == 0)
422 break;
423 }
424 if (i && ((i % 4) == 0))
425 printk("\n");
426 printk(" %016lx", *stack++);
427 touch_nmi_watchdog();
428 }
429 show_trace(tsk, regs, sp, bp);
430}
431
432void show_stack(struct task_struct *tsk, unsigned long * sp)
433{
434 _show_stack(tsk, NULL, sp, 0);
435}
436
437/*
438 * The architecture-independent dump_stack generator
439 */
440void dump_stack(void)
441{
442 unsigned long dummy;
443 unsigned long bp = 0;
444
445#ifdef CONFIG_FRAME_POINTER
446 if (!bp)
447 asm("movq %%rbp, %0" : "=r" (bp):);
448#endif
449
450 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
451 current->pid, current->comm, print_tainted(),
452 init_utsname()->release,
453 (int)strcspn(init_utsname()->version, " "),
454 init_utsname()->version);
455 show_trace(NULL, NULL, &dummy, bp);
456}
457
458EXPORT_SYMBOL(dump_stack);
459
460void show_registers(struct pt_regs *regs)
461{
462 int i;
463 unsigned long sp;
464 const int cpu = smp_processor_id();
465 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
466 u8 *ip;
467 unsigned int code_prologue = code_bytes * 43 / 64;
468 unsigned int code_len = code_bytes;
469
470 sp = regs->sp;
471 ip = (u8 *) regs->ip - code_prologue;
472 printk("CPU %d ", cpu);
473 __show_regs(regs);
474 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
475 cur->comm, cur->pid, task_thread_info(cur), cur);
476
477 /*
478 * When in-kernel, we also print out the stack and code at the
479 * time of the fault..
480 */
481 if (!user_mode(regs)) {
482 unsigned char c;
483 printk("Stack: ");
484 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
485 printk("\n");
486
487 printk(KERN_EMERG "Code: ");
488 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
489 /* try starting at RIP */
490 ip = (u8 *) regs->ip;
491 code_len = code_len - code_prologue + 1;
492 }
493 for (i = 0; i < code_len; i++, ip++) {
494 if (ip < (u8 *)PAGE_OFFSET ||
495 probe_kernel_address(ip, c)) {
496 printk(" Bad RIP value.");
497 break;
498 }
499 if (ip == (u8 *)regs->ip)
500 printk("<%02x> ", c);
501 else
502 printk("%02x ", c);
503 }
504 }
505 printk("\n");
506}
507
508int is_valid_bugaddr(unsigned long ip)
509{
510 unsigned short ud2;
511
512 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
513 return 0;
514
515 return ud2 == 0x0b0f;
516}
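(The 0x0b0f constant above is the ud2 opcode, bytes 0x0f 0x0b, read back as a little-endian 16-bit load: BUG() plants a ud2 at the bug site, and die() below calls report_bug(), which uses is_valid_bugaddr() to confirm the faulting IP really points at one. Sketch of the emitting side; the real macro also records file/line in the __bug_table section.)

	#define BUG_SKETCH()					\
	do {							\
		asm volatile("ud2");	/* 0x0f 0x0b */		\
		for (;;)					\
			;					\
	} while (0)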
517
518static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
519static int die_owner = -1;
520static unsigned int die_nest_count;
521
522unsigned __kprobes long oops_begin(void)
523{
524 int cpu;
525 unsigned long flags;
526
527 oops_enter();
528
529 /* racy, but better than risking deadlock. */
530 raw_local_irq_save(flags);
531 cpu = smp_processor_id();
532 if (!__raw_spin_trylock(&die_lock)) {
533 if (cpu == die_owner)
534 /* nested oops. should stop eventually */;
535 else
536 __raw_spin_lock(&die_lock);
537 }
538 die_nest_count++;
539 die_owner = cpu;
540 console_verbose();
541 bust_spinlocks(1);
542 return flags;
543}
544
545void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
546{
547 die_owner = -1;
548 bust_spinlocks(0);
549 die_nest_count--;
550 if (!die_nest_count)
551 /* Nest count reaches zero, release the lock. */
552 __raw_spin_unlock(&die_lock);
553 raw_local_irq_restore(flags);
554 if (!regs) {
555 oops_exit();
556 return;
557 }
558 if (panic_on_oops)
559 panic("Fatal exception");
560 oops_exit();
561 do_exit(signr);
562}
563
564int __kprobes __die(const char * str, struct pt_regs * regs, long err)
565{
566 static int die_counter;
567 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
568#ifdef CONFIG_PREEMPT
569 printk("PREEMPT ");
570#endif
571#ifdef CONFIG_SMP
572 printk("SMP ");
573#endif
574#ifdef CONFIG_DEBUG_PAGEALLOC
575 printk("DEBUG_PAGEALLOC");
576#endif
577 printk("\n");
578 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
579 return 1;
580 show_registers(regs);
581 add_taint(TAINT_DIE);
582 /* Executive summary in case the oops scrolled away */
583 printk(KERN_ALERT "RIP ");
584 printk_address(regs->ip, 1);
585 printk(" RSP <%016lx>\n", regs->sp);
586 if (kexec_should_crash(current))
587 crash_kexec(regs);
588 return 0;
589}
590
591void die(const char * str, struct pt_regs * regs, long err)
592{
593 unsigned long flags = oops_begin();
594
595 if (!user_mode(regs))
596 report_bug(regs->ip, regs);
597
598 if (__die(str, regs, err))
599 regs = NULL;
600 oops_end(flags, regs, SIGSEGV);
601}
602
603notrace __kprobes void
604die_nmi(char *str, struct pt_regs *regs, int do_panic)
605{
606 unsigned long flags;
607
608 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
609 NOTIFY_STOP)
610 return;
611
612 flags = oops_begin();
613 /*
614 * We are in trouble anyway, lets at least try
615 * to get a message out.
616 */
617 printk(str, smp_processor_id());
618 show_registers(regs);
619 if (kexec_should_crash(current))
620 crash_kexec(regs);
621 if (do_panic || panic_on_oops)
622 panic("Non maskable interrupt");
623 oops_end(flags, NULL, SIGBUS);
624 nmi_exit();
625 local_irq_enable();
626 do_exit(SIGBUS);
627}
628
629static void __kprobes do_trap(int trapnr, int signr, char *str,
630 struct pt_regs * regs, long error_code,
631 siginfo_t *info)
632{
633 struct task_struct *tsk = current;
634
635 if (user_mode(regs)) {
636 /*
637 * We want error_code and trap_no set for userspace
638 * faults and kernelspace faults which result in
639 * die(), but not kernelspace faults which are fixed
640 * up. die() gives the process no chance to handle
641 * the signal and notice the kernel fault information,
642 * so that won't result in polluting the information
643 * about previously queued, but not yet delivered,
644 * faults. See also do_general_protection below.
645 */
646 tsk->thread.error_code = error_code;
647 tsk->thread.trap_no = trapnr;
648
649 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
650 printk_ratelimit()) {
651 printk(KERN_INFO
652 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
653 tsk->comm, tsk->pid, str,
654 regs->ip, regs->sp, error_code);
655 print_vma_addr(" in ", regs->ip);
656 printk("\n");
657 }
658
659 if (info)
660 force_sig_info(signr, info, tsk);
661 else
662 force_sig(signr, tsk);
663 return;
664 }
665
666
667 if (!fixup_exception(regs)) {
668 tsk->thread.error_code = error_code;
669 tsk->thread.trap_no = trapnr;
670 die(str, regs, error_code);
671 }
672 return;
673}
674
675#define DO_ERROR(trapnr, signr, str, name) \
676asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
677{ \
678 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
679 == NOTIFY_STOP) \
680 return; \
681 conditional_sti(regs); \
682 do_trap(trapnr, signr, str, regs, error_code, NULL); \
683}
684
685#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
686asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
687{ \
688 siginfo_t info; \
689 info.si_signo = signr; \
690 info.si_errno = 0; \
691 info.si_code = sicode; \
692 info.si_addr = (void __user *)siaddr; \
693 trace_hardirqs_fixup(); \
694 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
695 == NOTIFY_STOP) \
696 return; \
697 conditional_sti(regs); \
698 do_trap(trapnr, signr, str, regs, error_code, &info); \
699}
700
701DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
702DO_ERROR( 4, SIGSEGV, "overflow", overflow)
703DO_ERROR( 5, SIGSEGV, "bounds", bounds)
704DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
705DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
706DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
707DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
708DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
709DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
710DO_ERROR(18, SIGSEGV, "reserved", reserved)
711
712/* Runs on IST stack */
713asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
714{
715 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
716 12, SIGBUS) == NOTIFY_STOP)
717 return;
718 preempt_conditional_sti(regs);
719 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
720 preempt_conditional_cli(regs);
721}
722
723asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
724{
725 static const char str[] = "double fault";
726 struct task_struct *tsk = current;
727
728 /* Return not checked because double check cannot be ignored */
 728	/* Return not checked because double fault cannot be ignored */
729 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
730
731 tsk->thread.error_code = error_code;
732 tsk->thread.trap_no = 8;
733
734 /* This is always a kernel trap and never fixable (and thus must
735 never return). */
736 for (;;)
737 die(str, regs, error_code);
738}
739
740asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
741 long error_code)
742{
743 struct task_struct *tsk = current;
744
745 conditional_sti(regs);
746
747 if (user_mode(regs)) {
748 tsk->thread.error_code = error_code;
749 tsk->thread.trap_no = 13;
750
751 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
752 printk_ratelimit()) {
753 printk(KERN_INFO
754 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
755 tsk->comm, tsk->pid,
756 regs->ip, regs->sp, error_code);
757 print_vma_addr(" in ", regs->ip);
758 printk("\n");
759 }
760
761 force_sig(SIGSEGV, tsk);
762 return;
763 }
764
765 if (fixup_exception(regs))
766 return;
767
768 tsk->thread.error_code = error_code;
769 tsk->thread.trap_no = 13;
770 if (notify_die(DIE_GPF, "general protection fault", regs,
771 error_code, 13, SIGSEGV) == NOTIFY_STOP)
772 return;
773 die("general protection fault", regs, error_code);
774}
775
776static notrace __kprobes void
777mem_parity_error(unsigned char reason, struct pt_regs * regs)
778{
779 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
780 reason);
781 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
782
783#if defined(CONFIG_EDAC)
784 if(edac_handler_set()) {
785 edac_atomic_assert_error();
786 return;
787 }
788#endif
789
790 if (panic_on_unrecovered_nmi)
791 panic("NMI: Not continuing");
792
793 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
794
795 /* Clear and disable the memory parity error line. */
796 reason = (reason & 0xf) | 4;
797 outb(reason, 0x61);
798}
799
800static notrace __kprobes void
801io_check_error(unsigned char reason, struct pt_regs * regs)
802{
803 printk("NMI: IOCK error (debug interrupt?)\n");
804 show_registers(regs);
805
806 /* Re-enable the IOCK line, wait for a few seconds */
807 reason = (reason & 0xf) | 8;
808 outb(reason, 0x61);
809 mdelay(2000);
810 reason &= ~8;
811 outb(reason, 0x61);
812}
813
814static notrace __kprobes void
815unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
816{
817 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
818 return;
819 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
820 reason);
821 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
822
823 if (panic_on_unrecovered_nmi)
824 panic("NMI: Not continuing");
825
826 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
827}
828
829/* Runs on IST stack. This code must keep interrupts off all the time.
830 Nested NMIs are prevented by the CPU. */
831asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
832{
833 unsigned char reason = 0;
834 int cpu;
835
836 cpu = smp_processor_id();
837
838 /* Only the BSP gets external NMIs from the system. */
839 if (!cpu)
840 reason = get_nmi_reason();
841
842 if (!(reason & 0xc0)) {
843 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
844 == NOTIFY_STOP)
845 return;
846 /*
847 * Ok, so this is none of the documented NMI sources,
848 * so it must be the NMI watchdog.
849 */
850 if (nmi_watchdog_tick(regs,reason))
851 return;
852 if (!do_nmi_callback(regs,cpu))
853 unknown_nmi_error(reason, regs);
854
855 return;
856 }
857 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
858 return;
859
860 /* AK: following checks seem to be broken on modern chipsets. FIXME */
861
862 if (reason & 0x80)
863 mem_parity_error(reason, regs);
864 if (reason & 0x40)
865 io_check_error(reason, regs);
866}
867
868/* runs on IST stack. */
869asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
870{
871 trace_hardirqs_fixup();
872
873 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
874 return;
875 }
876 preempt_conditional_sti(regs);
877 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
878 preempt_conditional_cli(regs);
879}
880
881/* Help handler running on IST stack to switch back to user stack
882 for scheduling or signal handling. The actual stack switch is done in
883 entry.S */
884asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
885{
886 struct pt_regs *regs = eregs;
887 /* Did already sync */
888 if (eregs == (struct pt_regs *)eregs->sp)
889 ;
890 /* Exception from user space */
891 else if (user_mode(eregs))
892 regs = task_pt_regs(current);
893 /* Exception from kernel and interrupts are enabled. Move to
894 kernel process stack. */
895 else if (eregs->flags & X86_EFLAGS_IF)
896 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
897 if (eregs != regs)
898 *regs = *eregs;
899 return regs;
900}
901
902/* runs on IST stack. */
903asmlinkage void __kprobes do_debug(struct pt_regs * regs,
904 unsigned long error_code)
905{
906 unsigned long condition;
907 struct task_struct *tsk = current;
908 siginfo_t info;
909
910 trace_hardirqs_fixup();
911
912 get_debugreg(condition, 6);
913
914 /*
915 * The processor cleared BTF, so don't mark that we need it set.
916 */
917 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
918 tsk->thread.debugctlmsr = 0;
919
920 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
921 SIGTRAP) == NOTIFY_STOP)
922 return;
923
924 preempt_conditional_sti(regs);
925
926 /* Mask out spurious debug traps due to lazy DR7 setting */
927 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
928 if (!tsk->thread.debugreg7) {
929 goto clear_dr7;
930 }
931 }
932
933 tsk->thread.debugreg6 = condition;
934
935
936 /*
937 * Single-stepping through TF: make sure we ignore any events in
938 * kernel space (but re-enable TF when returning to user mode).
939 */
940 if (condition & DR_STEP) {
941 if (!user_mode(regs))
942 goto clear_TF_reenable;
943 }
944
945 /* Ok, finally something we can handle */
946 tsk->thread.trap_no = 1;
947 tsk->thread.error_code = error_code;
948 info.si_signo = SIGTRAP;
949 info.si_errno = 0;
950 info.si_code = TRAP_BRKPT;
951 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
952 force_sig_info(SIGTRAP, &info, tsk);
953
954clear_dr7:
955 set_debugreg(0UL, 7);
956 preempt_conditional_cli(regs);
957 return;
958
959clear_TF_reenable:
960 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
961 regs->flags &= ~X86_EFLAGS_TF;
962 preempt_conditional_cli(regs);
963}
964
965static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
966{
967 if (fixup_exception(regs))
968 return 1;
969
970 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
971 /* Illegal floating point operation in the kernel */
972 current->thread.trap_no = trapnr;
973 die(str, regs, 0);
974 return 0;
975}
976
977/*
978 * Note that we play around with the 'TS' bit in an attempt to get
979 * the correct behaviour even in the presence of the asynchronous
980 * IRQ13 behaviour
981 */
982asmlinkage void do_coprocessor_error(struct pt_regs *regs)
983{
984 void __user *ip = (void __user *)(regs->ip);
985 struct task_struct * task;
986 siginfo_t info;
987 unsigned short cwd, swd;
988
989 conditional_sti(regs);
990 if (!user_mode(regs) &&
991 kernel_math_error(regs, "kernel x87 math error", 16))
992 return;
993
994 /*
995 * Save the info for the exception handler and clear the error.
996 */
997 task = current;
998 save_init_fpu(task);
999 task->thread.trap_no = 16;
1000 task->thread.error_code = 0;
1001 info.si_signo = SIGFPE;
1002 info.si_errno = 0;
1003 info.si_code = __SI_FAULT;
1004 info.si_addr = ip;
1005 /*
1006 * (~cwd & swd) will mask out exceptions that are not set to unmasked
1007 * status. 0x3f is the exception bits in these regs, 0x200 is the
1008 * C1 reg you need in case of a stack fault, 0x040 is the stack
1009 * fault bit. We should only be taking one exception at a time,
1010 * so if this combination doesn't produce any single exception,
1011 * then we have a bad program that isn't synchronizing its FPU usage
1012 * and it will suffer the consequences since we won't be able to
1013 * fully reproduce the context of the exception
1014 */
1015 cwd = get_fpu_cwd(task);
1016 swd = get_fpu_swd(task);
1017 switch (swd & ~cwd & 0x3f) {
1018 case 0x000:
1019 default:
1020 break;
1021 case 0x001: /* Invalid Op */
1022 /*
1023 * swd & 0x240 == 0x040: Stack Underflow
1024 * swd & 0x240 == 0x240: Stack Overflow
1025 * User must clear the SF bit (0x40) if set
1026 */
1027 info.si_code = FPE_FLTINV;
1028 break;
1029 case 0x002: /* Denormalize */
1030 case 0x010: /* Underflow */
1031 info.si_code = FPE_FLTUND;
1032 break;
1033 case 0x004: /* Zero Divide */
1034 info.si_code = FPE_FLTDIV;
1035 break;
1036 case 0x008: /* Overflow */
1037 info.si_code = FPE_FLTOVF;
1038 break;
1039 case 0x020: /* Precision */
1040 info.si_code = FPE_FLTRES;
1041 break;
1042 }
1043 force_sig_info(SIGFPE, &info, task);
1044}
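/*
 * A minimal, self-contained sketch (not kernel code) of the
 * (~cwd & swd) & 0x3f decoding used by do_coprocessor_error() above.
 * The helper name decode_x87_fault() and the sample register values
 * are made up for illustration only.
 */
#include <stdio.h>

static const char *decode_x87_fault(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {	/* pending *and* unmasked exceptions */
	case 0x001: return "FPE_FLTINV";	/* invalid op; swd & 0x240 tells stack under/overflow */
	case 0x002:				/* denormal operand */
	case 0x010: return "FPE_FLTUND";	/* underflow */
	case 0x004: return "FPE_FLTDIV";	/* divide by zero */
	case 0x008: return "FPE_FLTOVF";	/* overflow */
	case 0x020: return "FPE_FLTRES";	/* precision */
	default:    return "none or ambiguous";
	}
}

int main(void)
{
	/* control word 0x037e: only invalid-op unmasked; status 0x0041: IE + SF set */
	printf("%s\n", decode_x87_fault(0x037e, 0x0041));	/* prints FPE_FLTINV */
	return 0;
}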
1045
1046asmlinkage void bad_intr(void)
1047{
1048 printk("bad interrupt");
1049}
1050
1051asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1052{
1053 void __user *ip = (void __user *)(regs->ip);
1054 struct task_struct * task;
1055 siginfo_t info;
1056 unsigned short mxcsr;
1057
1058 conditional_sti(regs);
1059 if (!user_mode(regs) &&
1060 kernel_math_error(regs, "kernel simd math error", 19))
1061 return;
1062
1063 /*
1064 * Save the info for the exception handler and clear the error.
1065 */
1066 task = current;
1067 save_init_fpu(task);
1068 task->thread.trap_no = 19;
1069 task->thread.error_code = 0;
1070 info.si_signo = SIGFPE;
1071 info.si_errno = 0;
1072 info.si_code = __SI_FAULT;
1073 info.si_addr = ip;
1074 /*
1075 * The SIMD FPU exceptions are handled a little differently, as there
1076 * is only a single status/control register. Thus, to determine which
1077 * unmasked exception was caught we must mask the exception mask bits
1078 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1079 */
1080 mxcsr = get_fpu_mxcsr(task);
1081 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1082 case 0x000:
1083 default:
1084 break;
1085 case 0x001: /* Invalid Op */
1086 info.si_code = FPE_FLTINV;
1087 break;
1088 case 0x002: /* Denormalize */
1089 case 0x010: /* Underflow */
1090 info.si_code = FPE_FLTUND;
1091 break;
1092 case 0x004: /* Zero Divide */
1093 info.si_code = FPE_FLTDIV;
1094 break;
1095 case 0x008: /* Overflow */
1096 info.si_code = FPE_FLTOVF;
1097 break;
1098 case 0x020: /* Precision */
1099 info.si_code = FPE_FLTRES;
1100 break;
1101 }
1102 force_sig_info(SIGFPE, &info, task);
1103}
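/*
 * Illustration only, not kernel code: MXCSR keeps both the SSE
 * exception mask bits (0x1f80) and the exception flags (0x3f) in one
 * register, so do_simd_coprocessor_error() shifts the mask field down
 * by 7 before filtering the flags. The helper name and the sample
 * value below are invented for the example.
 */
#include <stdio.h>

static unsigned int simd_unmasked_fault(unsigned int mxcsr)
{
	/* keep only flag bits whose corresponding mask bit is clear */
	return ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);
}

int main(void)
{
	/* 0x1d84: divide-by-zero flag (0x04) set, its mask bit (0x200) clear */
	printf("0x%03x\n", simd_unmasked_fault(0x1d84));	/* prints 0x004 */
	return 0;
}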
1104
1105asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1106{
1107}
1108
1109asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1110{
1111}
1112
1113asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1114{
1115}
1116
1117/*
1118 * 'math_state_restore()' saves the current math information in the
1119 * old math state array, and gets the new ones from the current task
1120 *
1121 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1122 * Don't touch unless you *really* know how it works.
1123 */
1124asmlinkage void math_state_restore(void)
1125{
1126 struct task_struct *me = current;
1127
1128 if (!used_math()) {
1129 local_irq_enable();
1130 /*
1131 * does a slab alloc which can sleep
1132 */
1133 if (init_fpu(me)) {
1134 /*
1135 * ran out of memory!
1136 */
1137 do_group_exit(SIGKILL);
1138 return;
1139 }
1140 local_irq_disable();
1141 }
1142
1143 clts(); /* Allow maths ops (or we recurse) */
1144 restore_fpu_checking(&me->thread.xstate->fxsave);
1145 task_thread_info(me)->status |= TS_USEDFPU;
1146 me->fpu_counter++;
1147}
1148EXPORT_SYMBOL_GPL(math_state_restore);
1149
1150void __init trap_init(void)
1151{
1152 set_intr_gate(0,&divide_error);
1153 set_intr_gate_ist(1,&debug,DEBUG_STACK);
1154 set_intr_gate_ist(2,&nmi,NMI_STACK);
1155 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
1156 set_system_gate(4,&overflow); /* int4 can be called from all */
1157 set_intr_gate(5,&bounds);
1158 set_intr_gate(6,&invalid_op);
1159 set_intr_gate(7,&device_not_available);
1160 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
1161 set_intr_gate(9,&coprocessor_segment_overrun);
1162 set_intr_gate(10,&invalid_TSS);
1163 set_intr_gate(11,&segment_not_present);
1164 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
1165 set_intr_gate(13,&general_protection);
1166 set_intr_gate(14,&page_fault);
1167 set_intr_gate(15,&spurious_interrupt_bug);
1168 set_intr_gate(16,&coprocessor_error);
1169 set_intr_gate(17,&alignment_check);
1170#ifdef CONFIG_X86_MCE
1171 set_intr_gate_ist(18,&machine_check, MCE_STACK);
1172#endif
1173 set_intr_gate(19,&simd_coprocessor_error);
1174
1175#ifdef CONFIG_IA32_EMULATION
1176 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1177#endif
1178
1179 /*
1180 * initialize the per thread extended state:
1181 */
1182 init_thread_xstate();
1183 /*
1184 * Should be a barrier for any external CPU state.
1185 */
1186 cpu_init();
1187}
1188
1189
1190static int __init oops_setup(char *s)
1191{
1192 if (!s)
1193 return -EINVAL;
1194 if (!strcmp(s, "panic"))
1195 panic_on_oops = 1;
1196 return 0;
1197}
1198early_param("oops", oops_setup);
1199
1200static int __init kstack_setup(char *s)
1201{
1202 if (!s)
1203 return -EINVAL;
1204 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1205 return 0;
1206}
1207early_param("kstack", kstack_setup);
1208
1209
1210static int __init code_bytes_setup(char *s)
1211{
1212 code_bytes = simple_strtoul(s, NULL, 0);
1213 if (code_bytes > 8192)
1214 code_bytes = 8192;
1215
1216 return 1;
1217}
1218__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 000000000000..161bb850fc47
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,849 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h>
10#include <linux/clocksource.h>
11#include <linux/percpu.h>
12
13#include <asm/hpet.h>
14#include <asm/timer.h>
15#include <asm/vgtod.h>
16#include <asm/time.h>
17#include <asm/delay.h>
18
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz);
21unsigned int tsc_khz;
22EXPORT_SYMBOL(tsc_khz);
23
24/*
25 * TSC can be unstable due to cpufreq or due to unsynced TSCs
26 */
27static int tsc_unstable;
28
29/* native_sched_clock() is called before tsc_init(), so
30 we must start with the TSC soft disabled to prevent
31 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1;
33
34/*
35 * Scheduler clock - returns current time in nanosec units.
36 */
37u64 native_sched_clock(void)
38{
39 u64 this_offset;
40
41 /*
42 * Fall back to jiffies if there's no TSC available:
43 * ( But note that we still use it if the TSC is marked
44 * unstable. We do this because unlike Time Of Day,
45 * the scheduler clock tolerates small errors and it's
46 * very important for it to be as fast as the platform
47 * can achieve it. )
48 */
49 if (unlikely(tsc_disabled)) {
50 /* No locking but a rare wrong value is not a big deal: */
51 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
52 }
53
54 /* read the Time Stamp Counter: */
55 rdtscll(this_offset);
56
57 /* return the value in ns */
58 return cycles_2_ns(this_offset);
59}
60
61/* We need to define a real function for sched_clock, to override the
62 weak default version */
63#ifdef CONFIG_PARAVIRT
64unsigned long long sched_clock(void)
65{
66 return paravirt_sched_clock();
67}
68#else
69unsigned long long
70sched_clock(void) __attribute__((alias("native_sched_clock")));
71#endif
72
73int check_tsc_unstable(void)
74{
75 return tsc_unstable;
76}
77EXPORT_SYMBOL_GPL(check_tsc_unstable);
78
79#ifdef CONFIG_X86_TSC
80int __init notsc_setup(char *str)
81{
82 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
83 "cannot disable TSC completely.\n");
84 tsc_disabled = 1;
85 return 1;
86}
87#else
88/*
89 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
90 * in cpu/common.c
91 */
92int __init notsc_setup(char *str)
93{
94 setup_clear_cpu_cap(X86_FEATURE_TSC);
95 return 1;
96}
97#endif
98
99__setup("notsc", notsc_setup);
100
101#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000
103
104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */
107static u64 tsc_read_refs(u64 *p, int hpet)
108{
109 u64 t1, t2;
110 int i;
111
112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles();
114 if (hpet)
115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else
117 *p = acpi_pm_read_early();
118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2;
121 }
122 return ULLONG_MAX;
123}
124
125/*
126 * Calculate the TSC frequency from HPET reference
127 */
128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{
130 u64 tmp;
131
132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
148
149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
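/*
 * Worked example of the unit arithmetic in calc_hpet_ref() and
 * calc_pmtimer_ref() above; the concrete numbers are illustrative
 * only. The caller (native_calibrate_tsc() below) passes deltatsc
 * already multiplied by 10^6, so dividing by the elapsed time in
 * nanoseconds yields cycles per millisecond, i.e. kHz:
 *
 *   HPET:  elapsed_ns = hpet_ticks * HPET_PERIOD (femtoseconds/tick) / 10^6
 *   PMTMR: elapsed_ns = pm_ticks * 10^9 / PMTMR_TICKS_PER_SEC
 *   tsc_khz = (tsc_cycles * 10^6) / elapsed_ns
 */
#include <stdio.h>

int main(void)
{
	unsigned long long tsc_cycles = 100000000ULL;	/* ~2 GHz TSC over ~50 ms */
	unsigned long long pm_ticks   = 178977ULL;	/* ~50 ms of the 3.579545 MHz PM timer */
	unsigned long long elapsed_ns = pm_ticks * 1000000000ULL / 3579545ULL;
	unsigned long long tsc_khz    = tsc_cycles * 1000000ULL / elapsed_ns;

	printf("%llu kHz\n", tsc_khz);	/* ~2000000, i.e. a ~2 GHz TSC */
	return 0;
}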
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
172 * Try to calibrate the TSC against the Programmable
173 * Interrupt Timer and return the frequency of the TSC
174 * in kHz.
175 *
176 * Return ULONG_MAX on failure to calibrate.
177 */
178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
179{
180 u64 tsc, t1, t2, delta;
181 unsigned long tscmin, tscmax;
182 int pitcnt;
183
184 /* Set the Gate high, disable speaker */
185 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
186
187 /*
188 * Set up CTC channel 2 for mode 0 (interrupt on terminal
189 * count), binary count. Load the requested latch value
190 * (LSB then MSB) to begin the countdown.
191 */
192 outb(0xb0, 0x43);
193 outb(latch & 0xff, 0x42);
194 outb(latch >> 8, 0x42);
195
196 tsc = t1 = t2 = get_cycles();
197
198 pitcnt = 0;
199 tscmax = 0;
200 tscmin = ULONG_MAX;
201 while ((inb(0x61) & 0x20) == 0) {
202 t2 = get_cycles();
203 delta = t2 - tsc;
204 tsc = t2;
205 if ((unsigned long) delta < tscmin)
206 tscmin = (unsigned int) delta;
207 if ((unsigned long) delta > tscmax)
208 tscmax = (unsigned int) delta;
209 pitcnt++;
210 }
211
212 /*
213 * Sanity checks:
214 *
215 * If we were not able to read the PIT more than loopmin
216 * times, then we have been hit by a massive SMI
217 *
218 * If the maximum is 10 times larger than the minimum,
219 * then we got hit by an SMI as well.
220 */
221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
222 return ULONG_MAX;
223
224 /* Calculate the PIT value */
225 delta = t2 - t1;
226 do_div(delta, ms);
227 return delta;
228}
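/*
 * Illustration only: the value returned above is already in kHz
 * because the TSC delta is divided by the latch time in milliseconds,
 * and cycles/ms == kHz. E.g. a CAL_MS (10 ms) latch that spans
 * 20,000,000 TSC cycles gives 20000000 / 10 = 2000000 kHz, i.e. a
 * 2 GHz TSC.
 */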
229
230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (i.e. we allow up to a 2us read+counter
242 * update - anything else implies an unacceptably slow CPU
243 * or PIT for the fast calibration to work).
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
256 * will then consider it a failure when it doesn't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
263 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
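/*
 * Illustration only: at PIT_TICK_RATE ~= 1193182 Hz one PIT tick is
 * ~0.838 us, so the MSB changes every 256 ticks ~= 214.6 us. With
 * roughly 2 us per LSB+MSB read pair that allows on the order of 100
 * loop iterations per MSB value, which is why the "count > 50" check
 * above treats anything above 50 as a clean, undisturbed sequence.
 */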
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
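/*
 * Illustration only: QUICK_PIT_ITERATIONS evaluates to
 * 15 * 1193182 / 1000 / 256 = 69, so the measured window is
 * 69 * 256 = 17664 PIT ticks ~= 14.8 ms. A 2 GHz TSC accumulates
 * roughly 29,600,000 cycles in that window, and
 *
 *   29600000 * 1193182 / (69 * 256 * 1000) ~= 2000000
 *
 * i.e. quick_pit_calibrate() returns the TSC frequency in kHz.
 */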
347
348/**
349 * native_calibrate_tsc - calibrate the tsc on boot
350 */
351unsigned long native_calibrate_tsc(void)
352{
353 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate;
356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
363
364 /*
365 * Run 5 calibration loops to get the lowest frequency value
366 * (the best estimate). We use two different calibration modes
367 * here:
368 *
369 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
370 * load a timeout of 50ms. We read the time right after we
371 * started the timer and wait until the PIT count down reaches
372 * zero. In each wait loop iteration we read the TSC and check
373 * the delta to the previous read. We keep track of the min
374 * and max values of that delta. The delta is mostly defined
375 * by the IO time of the PIT access, so we can detect when an
376 * SMI/SMM disturbance happened between the two reads. If the
377 * maximum time is significantly larger than the minimum time,
378 * then we discard the result and have another try.
379 *
380 * 2) Reference counter. If available we use the HPET or the
381 * PMTIMER as a reference to check the sanity of that value.
382 * We use separate TSC readouts and check inside of the
383 * reference read for an SMI/SMM disturbance. We discard
384 * disturbed values here as well. We do that around the PIT
385 * calibration delay loop as we have to wait for a certain
386 * amount of time anyway.
387 */
388
389 /* Preset PIT loop values */
390 latch = CAL_LATCH;
391 ms = CAL_MS;
392 loopmin = CAL_PIT_LOOPS;
393
394 for (i = 0; i < 3; i++) {
395 unsigned long tsc_pit_khz;
396
397 /*
398 * Read the start value and the reference count of
399 * hpet/pmtimer when available. Then do the PIT
400 * calibration, which will take at least 50ms, and
401 * read the end value.
402 */
403 local_irq_save(flags);
404 tsc1 = tsc_read_refs(&ref1, hpet);
405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
406 tsc2 = tsc_read_refs(&ref2, hpet);
407 local_irq_restore(flags);
408
409 /* Pick the lowest PIT TSC calibration so far */
410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
411
412 /* hpet or pmtimer available ? */
413 if (!hpet && !ref1 && !ref2)
414 continue;
415
416 /* Check whether the sampling was disturbed by an SMI */
417 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
418 continue;
419
420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
425
426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
427
428 /* Check the reference deviation */
429 delta = ((u64) tsc_pit_min) * 100;
430 do_div(delta, tsc_ref_min);
431
432 /*
433 * If both calibration results are inside a 10% window
434 * then we can be sure, that the calibration
435 * succeeded. We break out of the loop right away. We
436 * use the reference value, as it is more precise.
437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
443 }
444
445 /*
446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
456 }
457
458 /*
459 * Now check the results.
460 */
461 if (tsc_pit_min == ULONG_MAX) {
462 /* PIT gave no useful value */
463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
464
465 /* We don't have an alternative source, disable TSC */
466 if (!hpet && !ref1 && !ref2) {
467 printk("TSC: No reference (HPET/PMTIMER) available\n");
468 return 0;
469 }
470
471 /* The alternative source failed as well, disable TSC */
472 if (tsc_ref_min == ULONG_MAX) {
473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
474 "failed.\n");
475 return 0;
476 }
477
478 /* Use the alternative source */
479 printk(KERN_INFO "TSC: using %s reference calibration\n",
480 hpet ? "HPET" : "PMTIMER");
481
482 return tsc_ref_min;
483 }
484
485 /* We don't have an alternative source, use the PIT calibration value */
486 if (!hpet && !ref1 && !ref2) {
487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
488 return tsc_pit_min;
489 }
490
491 /* The alternative source failed, use the PIT calibration value */
492 if (tsc_ref_min == ULONG_MAX) {
493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
494 "Using PIT calibration\n");
495 return tsc_pit_min;
496 }
497
498 /*
499 * The calibration values differ too much. In doubt, we use
500 * the PIT value as we know that there are PMTIMERs around
501 * running at double speed. At least we let the user know:
502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
506 return tsc_pit_min;
507}
508
509#ifdef CONFIG_X86_32
510/* Only called from the Powernow K7 cpu freq driver */
511int recalibrate_cpu_khz(void)
512{
513#ifndef CONFIG_SMP
514 unsigned long cpu_khz_old = cpu_khz;
515
516 if (cpu_has_tsc) {
517 tsc_khz = calibrate_tsc();
518 cpu_khz = tsc_khz;
519 cpu_data(0).loops_per_jiffy =
520 cpufreq_scale(cpu_data(0).loops_per_jiffy,
521 cpu_khz_old, cpu_khz);
522 return 0;
523 } else
524 return -ENODEV;
525#else
526 return -ENODEV;
527#endif
528}
529
530EXPORT_SYMBOL(recalibrate_cpu_khz);
531
532#endif /* CONFIG_X86_32 */
533
534/* Accelerators for sched_clock()
535 * convert from cycles(64bits) => nanoseconds (64bits)
536 * basic equation:
537 * ns = cycles / (freq / ns_per_sec)
538 * ns = cycles * (ns_per_sec / freq)
539 * ns = cycles * (10^9 / (cpu_khz * 10^3))
540 * ns = cycles * (10^6 / cpu_khz)
541 *
542 * Then we use scaling math (suggested by george@mvista.com) to get:
543 * ns = cycles * (10^6 * SC / cpu_khz) / SC
544 * ns = cycles * cyc2ns_scale / SC
545 *
546 * And since SC is a constant power of two, we can convert the div
547 * into a shift.
548 *
549 * We can use khz divisor instead of mhz to keep a better precision, since
550 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
551 * (mathieu.desnoyers@polymtl.ca)
552 *
553 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
554 */
555
556DEFINE_PER_CPU(unsigned long, cyc2ns);
557
558static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
559{
560 unsigned long long tsc_now, ns_now;
561 unsigned long flags, *scale;
562
563 local_irq_save(flags);
564 sched_clock_idle_sleep_event();
565
566 scale = &per_cpu(cyc2ns, cpu);
567
568 rdtscll(tsc_now);
569 ns_now = __cycles_2_ns(tsc_now);
570
571 if (cpu_khz)
572 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
573
574 sched_clock_idle_wakeup_event(0);
575 local_irq_restore(flags);
576}
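/*
 * A minimal sketch (not kernel code) of the scaled cycles->ns
 * conversion that set_cyc2ns_scale() prepares above, assuming
 * CYC2NS_SCALE_FACTOR == 10 and an illustrative 2 GHz CPU.
 */
#include <stdio.h>

#define SC_SHIFT 10	/* stand-in for CYC2NS_SCALE_FACTOR */

int main(void)
{
	unsigned long cpu_khz = 2000000UL;				/* 2 GHz */
	unsigned long scale = (1000000UL << SC_SHIFT) / cpu_khz;	/* 512 */
	unsigned long long cycles = 2000000000ULL;			/* one second of cycles */
	unsigned long long ns = cycles * scale >> SC_SHIFT;

	printf("scale=%lu ns=%llu\n", scale, ns);	/* scale=512 ns=1000000000 */
	return 0;
}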
577
578#ifdef CONFIG_CPU_FREQ
579
580/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
581 * changes.
582 *
583 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
584 * not that important because current Opteron setups do not support
585 * scaling on SMP anyroads.
586 *
587 * Should fix up last_tsc too. Currently gettimeofday in the
588 * first tick after the change will be slightly wrong.
589 */
590
591static unsigned int ref_freq;
592static unsigned long loops_per_jiffy_ref;
593static unsigned long tsc_khz_ref;
594
595static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
596 void *data)
597{
598 struct cpufreq_freqs *freq = data;
599 unsigned long *lpj, dummy;
600
601 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
602 return 0;
603
604 lpj = &dummy;
605 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
606#ifdef CONFIG_SMP
607 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
608#else
609 lpj = &boot_cpu_data.loops_per_jiffy;
610#endif
611
612 if (!ref_freq) {
613 ref_freq = freq->old;
614 loops_per_jiffy_ref = *lpj;
615 tsc_khz_ref = tsc_khz;
616 }
617 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
618 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
619 (val == CPUFREQ_RESUMECHANGE)) {
620 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
621
622 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
623 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
624 mark_tsc_unstable("cpufreq changes");
625 }
626
627 set_cyc2ns_scale(tsc_khz, freq->cpu);
628
629 return 0;
630}
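/*
 * Illustration only: cpufreq_scale(old, div, mult) returns
 * old * mult / div, so with a reference tsc_khz of 2000000 taken at
 * ref_freq == 2000000 kHz, a transition to freq->new == 1000000 kHz
 * rescales tsc_khz to 1000000; loops_per_jiffy is rescaled the same
 * way so that TSC-based delays stay roughly correct across the change.
 */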
631
632static struct notifier_block time_cpufreq_notifier_block = {
633 .notifier_call = time_cpufreq_notifier
634};
635
636static int __init cpufreq_tsc(void)
637{
638 if (!cpu_has_tsc)
639 return 0;
640 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
641 return 0;
642 cpufreq_register_notifier(&time_cpufreq_notifier_block,
643 CPUFREQ_TRANSITION_NOTIFIER);
644 return 0;
645}
646
647core_initcall(cpufreq_tsc);
648
649#endif /* CONFIG_CPU_FREQ */
650
651/* clocksource code */
652
653static struct clocksource clocksource_tsc;
654
655/*
656 * We compare the TSC to the cycle_last value in the clocksource
657 * structure to avoid a nasty time-warp. This can be observed in a
658 * very small window right after one CPU updated cycle_last under
659 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
660 * is smaller than the cycle_last reference value due to a TSC which
661 * is slightly behind. This delta is nowhere else observable, but in
662 * that case it results in a forward time jump in the range of hours
663 * due to the unsigned delta calculation of the time keeping core
664 * code, which is necessary to support wrapping clocksources like pm
665 * timer.
666 */
667static cycle_t read_tsc(void)
668{
669 cycle_t ret = (cycle_t)get_cycles();
670
671 return ret >= clocksource_tsc.cycle_last ?
672 ret : clocksource_tsc.cycle_last;
673}
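/*
 * Illustration only: the clamp above matters because the timekeeping
 * core computes the elapsed delta with an unsigned subtraction. If a
 * slightly-behind TSC returned cycle_last - 5, the delta would wrap
 * to a huge positive value instead of a small negative one, and after
 * the cycles->ns conversion that shows up as the forward jump "in the
 * range of hours" described in the comment. Returning cycle_last
 * instead makes the delta zero for that read.
 */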
674
675#ifdef CONFIG_X86_64
676static cycle_t __vsyscall_fn vread_tsc(void)
677{
678 cycle_t ret = (cycle_t)vget_cycles();
679
680 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
681 ret : __vsyscall_gtod_data.clock.cycle_last;
682}
683#endif
684
685static struct clocksource clocksource_tsc = {
686 .name = "tsc",
687 .rating = 300,
688 .read = read_tsc,
689 .mask = CLOCKSOURCE_MASK(64),
690 .shift = 22,
691 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
692 CLOCK_SOURCE_MUST_VERIFY,
693#ifdef CONFIG_X86_64
694 .vread = vread_tsc,
695#endif
696};
697
698void mark_tsc_unstable(char *reason)
699{
700 if (!tsc_unstable) {
701 tsc_unstable = 1;
702 printk("Marking TSC unstable due to %s\n", reason);
703 /* Change only the rating, when not registered */
704 if (clocksource_tsc.mult)
705 clocksource_change_rating(&clocksource_tsc, 0);
706 else
707 clocksource_tsc.rating = 0;
708 }
709}
710
711EXPORT_SYMBOL_GPL(mark_tsc_unstable);
712
713static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
714{
715 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
716 d->ident);
717 tsc_unstable = 1;
718 return 0;
719}
720
721/* List of systems that have known TSC problems */
722static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
723 {
724 .callback = dmi_mark_tsc_unstable,
725 .ident = "IBM Thinkpad 380XD",
726 .matches = {
727 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
728 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
729 },
730 },
731 {}
732};
733
734/*
735 * Geode_LX - the OLPC CPU has a possibly very reliable TSC
736 */
737#ifdef CONFIG_MGEODE_LX
738/* RTSC counts during suspend */
739#define RTSC_SUSP 0x100
740
741static void __init check_geode_tsc_reliable(void)
742{
743 unsigned long res_low, res_high;
744
745 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
746 if (res_low & RTSC_SUSP)
747 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
748}
749#else
750static inline void check_geode_tsc_reliable(void) { }
751#endif
752
753/*
754 * Make an educated guess if the TSC is trustworthy and synchronized
755 * over all CPUs.
756 */
757__cpuinit int unsynchronized_tsc(void)
758{
759 if (!cpu_has_tsc || tsc_unstable)
760 return 1;
761
762#ifdef CONFIG_SMP
763 if (apic_is_clustered_box())
764 return 1;
765#endif
766
767 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
768 return 0;
769 /*
770 * Intel systems are normally all synchronized.
771 * Exceptions must mark TSC as unstable:
772 */
773 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
774 /* assume multi socket systems are not synchronized: */
775 if (num_possible_cpus() > 1)
776 tsc_unstable = 1;
777 }
778
779 return tsc_unstable;
780}
781
782static void __init init_tsc_clocksource(void)
783{
784 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
785 clocksource_tsc.shift);
786 /* lower the rating if we already know it's unstable: */
787 if (check_tsc_unstable()) {
788 clocksource_tsc.rating = 0;
789 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
790 }
791 clocksource_register(&clocksource_tsc);
792}
793
794void __init tsc_init(void)
795{
796 u64 lpj;
797 int cpu;
798
799 if (!cpu_has_tsc)
800 return;
801
802 tsc_khz = calibrate_tsc();
803 cpu_khz = tsc_khz;
804
805 if (!tsc_khz) {
806 mark_tsc_unstable("could not calculate TSC khz");
807 return;
808 }
809
810#ifdef CONFIG_X86_64
811 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
812 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
813 cpu_khz = calibrate_cpu();
814#endif
815
816 lpj = ((u64)tsc_khz * 1000);
817 do_div(lpj, HZ);
818 lpj_fine = lpj;
819
820 printk("Detected %lu.%03lu MHz processor.\n",
821 (unsigned long)cpu_khz / 1000,
822 (unsigned long)cpu_khz % 1000);
823
824 /*
825 * Secondary CPUs do not run through tsc_init(), so set up
826 * all the scale factors for all CPUs, assuming the same
827 * speed as the bootup CPU. (cpufreq notifiers will fix this
828 * up if their speed diverges)
829 */
830 for_each_possible_cpu(cpu)
831 set_cyc2ns_scale(cpu_khz, cpu);
832
833 if (tsc_disabled > 0)
834 return;
835
836 /* now allow native_sched_clock() to use rdtsc */
837 tsc_disabled = 0;
838
839 use_tsc_delay();
840 /* Check and install the TSC clocksource */
841 dmi_check_system(bad_tsc_dmi_table);
842
843 if (unsynchronized_tsc())
844 mark_tsc_unstable("TSCs unsynchronized");
845
846 check_geode_tsc_reliable();
847 init_tsc_clocksource();
848}
849
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
deleted file mode 100644
index 65b70637ad97..000000000000
--- a/arch/x86/kernel/tsc_32.c
+++ /dev/null
@@ -1,451 +0,0 @@
1#include <linux/sched.h>
2#include <linux/clocksource.h>
3#include <linux/workqueue.h>
4#include <linux/cpufreq.h>
5#include <linux/jiffies.h>
6#include <linux/init.h>
7#include <linux/dmi.h>
8#include <linux/percpu.h>
9
10#include <asm/delay.h>
11#include <asm/tsc.h>
12#include <asm/io.h>
13#include <asm/timer.h>
14
15#include "mach_timer.h"
16
17/* native_sched_clock() is called before tsc_init(), so
18 we must start with the TSC soft disabled to prevent
19 erroneous rdtsc usage on !cpu_has_tsc processors */
20static int tsc_disabled = -1;
21
22/*
23 * On some systems the TSC frequency does not
24 * change with the cpu frequency. So we need
25 * an extra value to store the TSC freq
26 */
27unsigned int tsc_khz;
28EXPORT_SYMBOL_GPL(tsc_khz);
29
30#ifdef CONFIG_X86_TSC
31static int __init tsc_setup(char *str)
32{
33 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
34 "cannot disable TSC completely.\n");
35 tsc_disabled = 1;
36 return 1;
37}
38#else
39/*
40 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
41 * in cpu/common.c
42 */
43static int __init tsc_setup(char *str)
44{
45 setup_clear_cpu_cap(X86_FEATURE_TSC);
46 return 1;
47}
48#endif
49
50__setup("notsc", tsc_setup);
51
52/*
53 * code to mark and check if the TSC is unstable
54 * due to cpufreq or due to unsynced TSCs
55 */
56static int tsc_unstable;
57
58int check_tsc_unstable(void)
59{
60 return tsc_unstable;
61}
62EXPORT_SYMBOL_GPL(check_tsc_unstable);
63
64/* Accelerators for sched_clock()
65 * convert from cycles(64bits) => nanoseconds (64bits)
66 * basic equation:
67 * ns = cycles / (freq / ns_per_sec)
68 * ns = cycles * (ns_per_sec / freq)
69 * ns = cycles * (10^9 / (cpu_khz * 10^3))
70 * ns = cycles * (10^6 / cpu_khz)
71 *
72 * Then we use scaling math (suggested by george@mvista.com) to get:
73 * ns = cycles * (10^6 * SC / cpu_khz) / SC
74 * ns = cycles * cyc2ns_scale / SC
75 *
76 * And since SC is a constant power of two, we can convert the div
77 * into a shift.
78 *
79 * We can use khz divisor instead of mhz to keep a better precision, since
80 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
81 * (mathieu.desnoyers@polymtl.ca)
82 *
83 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
84 */
85
86DEFINE_PER_CPU(unsigned long, cyc2ns);
87
88static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
89{
90 unsigned long long tsc_now, ns_now;
91 unsigned long flags, *scale;
92
93 local_irq_save(flags);
94 sched_clock_idle_sleep_event();
95
96 scale = &per_cpu(cyc2ns, cpu);
97
98 rdtscll(tsc_now);
99 ns_now = __cycles_2_ns(tsc_now);
100
101 if (cpu_khz)
102 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
103
104 /*
105 * Start smoothly with the new frequency:
106 */
107 sched_clock_idle_wakeup_event(0);
108 local_irq_restore(flags);
109}
110
111/*
112 * Scheduler clock - returns current time in nanosec units.
113 */
114unsigned long long native_sched_clock(void)
115{
116 unsigned long long this_offset;
117
118 /*
119 * Fall back to jiffies if there's no TSC available:
120 * ( But note that we still use it if the TSC is marked
121 * unstable. We do this because unlike Time Of Day,
122 * the scheduler clock tolerates small errors and it's
123 * very important for it to be as fast as the platform
124 * can achieve it. )
125 */
126 if (unlikely(tsc_disabled))
127 /* No locking but a rare wrong value is not a big deal: */
128 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
129
130 /* read the Time Stamp Counter: */
131 rdtscll(this_offset);
132
133 /* return the value in ns */
134 return cycles_2_ns(this_offset);
135}
136
137/* We need to define a real function for sched_clock, to override the
138 weak default version */
139#ifdef CONFIG_PARAVIRT
140unsigned long long sched_clock(void)
141{
142 return paravirt_sched_clock();
143}
144#else
145unsigned long long sched_clock(void)
146 __attribute__((alias("native_sched_clock")));
147#endif
148
149unsigned long native_calculate_cpu_khz(void)
150{
151 unsigned long long start, end;
152 unsigned long count;
153 u64 delta64 = (u64)ULLONG_MAX;
154 int i;
155 unsigned long flags;
156
157 local_irq_save(flags);
158
159 /* run 3 times to ensure the cache is warm and to get an accurate reading */
160 for (i = 0; i < 3; i++) {
161 mach_prepare_counter();
162 rdtscll(start);
163 mach_countup(&count);
164 rdtscll(end);
165
166 /*
167 * Error: ECTCNEVERSET
168 * The CTC wasn't reliable: we got a hit on the very first read,
169 * or the CPU was so fast/slow that the quotient wouldn't fit in
170 * 32 bits..
171 */
172 if (count <= 1)
173 continue;
174
175 /* cpu freq too slow: */
176 if ((end - start) <= CALIBRATE_TIME_MSEC)
177 continue;
178
179 /*
180 * We want the minimum time of all runs in case one of them
181 * is inaccurate due to SMI or other delay
182 */
183 delta64 = min(delta64, (end - start));
184 }
185
186 /* cpu freq too fast (or every run was bad): */
187 if (delta64 > (1ULL<<32))
188 goto err;
189
190 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
191 do_div(delta64,CALIBRATE_TIME_MSEC);
192
193 local_irq_restore(flags);
194 return (unsigned long)delta64;
195err:
196 local_irq_restore(flags);
197 return 0;
198}
199
200int recalibrate_cpu_khz(void)
201{
202#ifndef CONFIG_SMP
203 unsigned long cpu_khz_old = cpu_khz;
204
205 if (cpu_has_tsc) {
206 cpu_khz = calculate_cpu_khz();
207 tsc_khz = cpu_khz;
208 cpu_data(0).loops_per_jiffy =
209 cpufreq_scale(cpu_data(0).loops_per_jiffy,
210 cpu_khz_old, cpu_khz);
211 return 0;
212 } else
213 return -ENODEV;
214#else
215 return -ENODEV;
216#endif
217}
218
219EXPORT_SYMBOL(recalibrate_cpu_khz);
220
221#ifdef CONFIG_CPU_FREQ
222
223/*
224 * if the CPU frequency is scaled, TSC-based delays will need a different
225 * loops_per_jiffy value to function properly.
226 */
227static unsigned int ref_freq;
228static unsigned long loops_per_jiffy_ref;
229static unsigned long cpu_khz_ref;
230
231static int
232time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
233{
234 struct cpufreq_freqs *freq = data;
235
236 if (!ref_freq) {
237 if (!freq->old){
238 ref_freq = freq->new;
239 return 0;
240 }
241 ref_freq = freq->old;
242 loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy;
243 cpu_khz_ref = cpu_khz;
244 }
245
246 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
247 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
248 (val == CPUFREQ_RESUMECHANGE)) {
249 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
250 cpu_data(freq->cpu).loops_per_jiffy =
251 cpufreq_scale(loops_per_jiffy_ref,
252 ref_freq, freq->new);
253
254 if (cpu_khz) {
255
256 if (num_online_cpus() == 1)
257 cpu_khz = cpufreq_scale(cpu_khz_ref,
258 ref_freq, freq->new);
259 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
260 tsc_khz = cpu_khz;
261 set_cyc2ns_scale(cpu_khz, freq->cpu);
262 /*
263 * TSC based sched_clock turns
264 * to junk w/ cpufreq
265 */
266 mark_tsc_unstable("cpufreq changes");
267 }
268 }
269 }
270
271 return 0;
272}
273
274static struct notifier_block time_cpufreq_notifier_block = {
275 .notifier_call = time_cpufreq_notifier
276};
277
278static int __init cpufreq_tsc(void)
279{
280 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
281 CPUFREQ_TRANSITION_NOTIFIER);
282}
283core_initcall(cpufreq_tsc);
284
285#endif
286
287/* clock source code */
288
289static unsigned long current_tsc_khz;
290static struct clocksource clocksource_tsc;
291
292/*
293 * We compare the TSC to the cycle_last value in the clocksource
294 * structure to avoid a nasty time-warp issue. This can be observed in
295 * a very small window right after one CPU updated cycle_last under
296 * xtime lock and the other CPU reads a TSC value which is smaller
297 * than the cycle_last reference value due to a TSC which is slightly
298 * behind. This delta is nowhere else observable, but in that case it
299 * results in a forward time jump in the range of hours due to the
300 * unsigned delta calculation of the time keeping core code, which is
301 * necessary to support wrapping clocksources like pm timer.
302 */
303static cycle_t read_tsc(void)
304{
305 cycle_t ret;
306
307 rdtscll(ret);
308
309 return ret >= clocksource_tsc.cycle_last ?
310 ret : clocksource_tsc.cycle_last;
311}
312
313static struct clocksource clocksource_tsc = {
314 .name = "tsc",
315 .rating = 300,
316 .read = read_tsc,
317 .mask = CLOCKSOURCE_MASK(64),
318 .mult = 0, /* to be set */
319 .shift = 22,
320 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
321 CLOCK_SOURCE_MUST_VERIFY,
322};
323
324void mark_tsc_unstable(char *reason)
325{
326 if (!tsc_unstable) {
327 tsc_unstable = 1;
328 printk("Marking TSC unstable due to: %s.\n", reason);
329 /* Can be called before registration */
330 if (clocksource_tsc.mult)
331 clocksource_change_rating(&clocksource_tsc, 0);
332 else
333 clocksource_tsc.rating = 0;
334 }
335}
336EXPORT_SYMBOL_GPL(mark_tsc_unstable);
337
338static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
339{
340 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
341 d->ident);
342 tsc_unstable = 1;
343 return 0;
344}
345
346/* List of systems that have known TSC problems */
347static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
348 {
349 .callback = dmi_mark_tsc_unstable,
350 .ident = "IBM Thinkpad 380XD",
351 .matches = {
352 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
353 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
354 },
355 },
356 {}
357};
358
359/*
360 * Make an educated guess if the TSC is trustworthy and synchronized
361 * over all CPUs.
362 */
363__cpuinit int unsynchronized_tsc(void)
364{
365 if (!cpu_has_tsc || tsc_unstable)
366 return 1;
367
368 /* Anything with constant TSC should be synchronized */
369 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
370 return 0;
371
372 /*
373 * Intel systems are normally all synchronized.
374 * Exceptions must mark TSC as unstable:
375 */
376 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
377 /* assume multi socket systems are not synchronized: */
378 if (num_possible_cpus() > 1)
379 tsc_unstable = 1;
380 }
381 return tsc_unstable;
382}
383
384/*
385 * Geode_LX - the OLPC CPU has a possibly very reliable TSC
386 */
387#ifdef CONFIG_MGEODE_LX
388/* RTSC counts during suspend */
389#define RTSC_SUSP 0x100
390
391static void __init check_geode_tsc_reliable(void)
392{
393 unsigned long res_low, res_high;
394
395 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
396 if (res_low & RTSC_SUSP)
397 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
398}
399#else
400static inline void check_geode_tsc_reliable(void) { }
401#endif
402
403
404void __init tsc_init(void)
405{
406 int cpu;
407
408 if (!cpu_has_tsc || tsc_disabled > 0)
409 return;
410
411 cpu_khz = calculate_cpu_khz();
412 tsc_khz = cpu_khz;
413
414 if (!cpu_khz) {
415 mark_tsc_unstable("could not calculate TSC khz");
416 return;
417 }
418
419 /* now allow native_sched_clock() to use rdtsc */
420 tsc_disabled = 0;
421
422 printk("Detected %lu.%03lu MHz processor.\n",
423 (unsigned long)cpu_khz / 1000,
424 (unsigned long)cpu_khz % 1000);
425
426 /*
427 * Secondary CPUs do not run through tsc_init(), so set up
428 * all the scale factors for all CPUs, assuming the same
429 * speed as the bootup CPU. (cpufreq notifiers will fix this
430 * up if their speed diverges)
431 */
432 for_each_possible_cpu(cpu)
433 set_cyc2ns_scale(cpu_khz, cpu);
434
435 use_tsc_delay();
436
437 /* Check and install the TSC clocksource */
438 dmi_check_system(bad_tsc_dmi_table);
439
440 unsynchronized_tsc();
441 check_geode_tsc_reliable();
442 current_tsc_khz = tsc_khz;
443 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
444 clocksource_tsc.shift);
445 /* lower the rating if we already know it's unstable: */
446 if (check_tsc_unstable()) {
447 clocksource_tsc.rating = 0;
448 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
449 }
450 clocksource_register(&clocksource_tsc);
451}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
deleted file mode 100644
index 1784b8077a12..000000000000
--- a/arch/x86/kernel/tsc_64.c
+++ /dev/null
@@ -1,357 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9#include <linux/acpi_pmtmr.h>
10
11#include <asm/hpet.h>
12#include <asm/timex.h>
13#include <asm/timer.h>
14#include <asm/vgtod.h>
15
16static int notsc __initdata = 0;
17
18unsigned int cpu_khz; /* TSC clocks / usec, not used here */
19EXPORT_SYMBOL(cpu_khz);
20unsigned int tsc_khz;
21EXPORT_SYMBOL(tsc_khz);
22
23/* Accelerators for sched_clock()
24 * convert from cycles(64bits) => nanoseconds (64bits)
25 * basic equation:
26 * ns = cycles / (freq / ns_per_sec)
27 * ns = cycles * (ns_per_sec / freq)
28 * ns = cycles * (10^9 / (cpu_khz * 10^3))
29 * ns = cycles * (10^6 / cpu_khz)
30 *
31 * Then we use scaling math (suggested by george@mvista.com) to get:
32 * ns = cycles * (10^6 * SC / cpu_khz) / SC
33 * ns = cycles * cyc2ns_scale / SC
34 *
35 * And since SC is a constant power of two, we can convert the div
36 * into a shift.
37 *
38 * We can use khz divisor instead of mhz to keep a better precision, since
39 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
40 * (mathieu.desnoyers@polymtl.ca)
41 *
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
44DEFINE_PER_CPU(unsigned long, cyc2ns);
45
46static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
47{
48 unsigned long long tsc_now, ns_now;
49 unsigned long flags, *scale;
50
51 local_irq_save(flags);
52 sched_clock_idle_sleep_event();
53
54 scale = &per_cpu(cyc2ns, cpu);
55
56 rdtscll(tsc_now);
57 ns_now = __cycles_2_ns(tsc_now);
58
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
64}
65
66unsigned long long native_sched_clock(void)
67{
68 unsigned long a = 0;
69
70 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
71 * which means it is not completely exact and may not be monotonous
72 * between CPUs. But the errors should be too small to matter for
73 * scheduling purposes.
74 */
75
76 rdtscll(a);
77 return cycles_2_ns(a);
78}
79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
93static int tsc_unstable;
94
95int check_tsc_unstable(void)
96{
97 return tsc_unstable;
98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
101#ifdef CONFIG_CPU_FREQ
102
103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
104 * changes.
105 *
106 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
107 * not that important because current Opteron setups do not support
108 * scaling on SMP anyroads.
109 *
110 * Should fix up last_tsc too. Currently gettimeofday in the
111 * first tick after the change will be slightly wrong.
112 */
113
114static unsigned int ref_freq;
115static unsigned long loops_per_jiffy_ref;
116static unsigned long tsc_khz_ref;
117
118static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
119 void *data)
120{
121 struct cpufreq_freqs *freq = data;
122 unsigned long *lpj, dummy;
123
124 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
125 return 0;
126
127 lpj = &dummy;
128 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
129#ifdef CONFIG_SMP
130 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
131#else
132 lpj = &boot_cpu_data.loops_per_jiffy;
133#endif
134
135 if (!ref_freq) {
136 ref_freq = freq->old;
137 loops_per_jiffy_ref = *lpj;
138 tsc_khz_ref = tsc_khz;
139 }
140 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
141 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
142 (val == CPUFREQ_RESUMECHANGE)) {
143 *lpj =
144 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
145
146 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
147 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
148 mark_tsc_unstable("cpufreq changes");
149 }
150
151 set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
152
153 return 0;
154}
155
156static struct notifier_block time_cpufreq_notifier_block = {
157 .notifier_call = time_cpufreq_notifier
158};
159
160static int __init cpufreq_tsc(void)
161{
162 cpufreq_register_notifier(&time_cpufreq_notifier_block,
163 CPUFREQ_TRANSITION_NOTIFIER);
164 return 0;
165}
166
167core_initcall(cpufreq_tsc);
168
169#endif
170
171#define MAX_RETRIES 5
172#define SMI_TRESHOLD 50000
173
174/*
175 * Read TSC and the reference counters. Take care of SMI disturbance
176 */
177static unsigned long __init tsc_read_refs(unsigned long *pm,
178 unsigned long *hpet)
179{
180 unsigned long t1, t2;
181 int i;
182
183 for (i = 0; i < MAX_RETRIES; i++) {
184 t1 = get_cycles();
185 if (hpet)
186 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
187 else
188 *pm = acpi_pm_read_early();
189 t2 = get_cycles();
190 if ((t2 - t1) < SMI_TRESHOLD)
191 return t2;
192 }
193 return ULONG_MAX;
194}
195
196/**
197 * tsc_calibrate - calibrate the tsc on boot
198 */
199void __init tsc_calibrate(void)
200{
201 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
202 int hpet = is_hpet_enabled(), cpu;
203
204 local_irq_save(flags);
205
206 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
207
208 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
209
210 outb(0xb0, 0x43);
211 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
212 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
213 tr1 = get_cycles();
214 while ((inb(0x61) & 0x20) == 0);
215 tr2 = get_cycles();
216
217 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
218
219 local_irq_restore(flags);
220
221 /*
222 * Preset the result with the raw and inaccurate PIT
223 * calibration value
224 */
225 tsc_khz = (tr2 - tr1) / 50;
226
227 /* hpet or pmtimer available ? */
228 if (!hpet && !pm1 && !pm2) {
229 printk(KERN_INFO "TSC calibrated against PIT\n");
230 goto out;
231 }
232
233 /* Check, whether the sampling was disturbed by an SMI */
234 if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
235 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
236 "using PIT calibration result\n");
237 goto out;
238 }
239
240 tsc2 = (tsc2 - tsc1) * 1000000L;
241
242 if (hpet) {
243 printk(KERN_INFO "TSC calibrated against HPET\n");
244 if (hpet2 < hpet1)
245 hpet2 += 0x100000000;
246 hpet2 -= hpet1;
247 tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
248 } else {
249 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
250 if (pm2 < pm1)
251 pm2 += ACPI_PM_OVRRUN;
252 pm2 -= pm1;
253 tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC;
254 }
255
256 tsc_khz = tsc2 / tsc1;
257
258out:
259 for_each_possible_cpu(cpu)
260 set_cyc2ns_scale(tsc_khz, cpu);
261}
262
263/*
264 * Make an educated guess if the TSC is trustworthy and synchronized
265 * over all CPUs.
266 */
267__cpuinit int unsynchronized_tsc(void)
268{
269 if (tsc_unstable)
270 return 1;
271
272#ifdef CONFIG_SMP
273 if (apic_is_clustered_box())
274 return 1;
275#endif
276
277 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
278 return 0;
279
280 /* Assume multi socket systems are not synchronized */
281 return num_present_cpus() > 1;
282}
283
284int __init notsc_setup(char *s)
285{
286 notsc = 1;
287 return 1;
288}
289
290__setup("notsc", notsc_setup);
291
292static struct clocksource clocksource_tsc;
293
294/*
295 * We compare the TSC to the cycle_last value in the clocksource
296 * structure to avoid a nasty time-warp. This can be observed in a
297 * very small window right after one CPU updated cycle_last under
298 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
299 * is smaller than the cycle_last reference value due to a TSC which
300 * is slightly behind. This delta is nowhere else observable, but in
301 * that case it results in a forward time jump in the range of hours
302 * due to the unsigned delta calculation of the time keeping core
303 * code, which is necessary to support wrapping clocksources like pm
304 * timer.
305 */
306static cycle_t read_tsc(void)
307{
308 cycle_t ret = (cycle_t)get_cycles();
309
310 return ret >= clocksource_tsc.cycle_last ?
311 ret : clocksource_tsc.cycle_last;
312}
313
314static cycle_t __vsyscall_fn vread_tsc(void)
315{
316 cycle_t ret = (cycle_t)vget_cycles();
317
318 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
319 ret : __vsyscall_gtod_data.clock.cycle_last;
320}
321
322static struct clocksource clocksource_tsc = {
323 .name = "tsc",
324 .rating = 300,
325 .read = read_tsc,
326 .mask = CLOCKSOURCE_MASK(64),
327 .shift = 22,
328 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
329 CLOCK_SOURCE_MUST_VERIFY,
330 .vread = vread_tsc,
331};
332
333void mark_tsc_unstable(char *reason)
334{
335 if (!tsc_unstable) {
336 tsc_unstable = 1;
337 printk("Marking TSC unstable due to %s\n", reason);
338 /* Change only the rating, when not registered */
339 if (clocksource_tsc.mult)
340 clocksource_change_rating(&clocksource_tsc, 0);
341 else
342 clocksource_tsc.rating = 0;
343 }
344}
345EXPORT_SYMBOL_GPL(mark_tsc_unstable);
346
347void __init init_tsc_clocksource(void)
348{
349 if (!notsc) {
350 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
351 clocksource_tsc.shift);
352 if (check_tsc_unstable())
353 clocksource_tsc.rating = 0;
354
355 clocksource_register(&clocksource_tsc);
356 }
357}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
 			__raw_spin_unlock(&sync_lock);
 		}
 	}
-	if (!(now-start)) {
-		printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
-			now-start, end-start);
-		WARN_ON(1);
-	}
+	WARN(!(now-start),
+	     "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
+	     now-start, end-start);
 }
 
 /*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
new file mode 100644
index 000000000000..61a97e616f70
--- /dev/null
+++ b/arch/x86/kernel/visws_quirks.c
@@ -0,0 +1,691 @@
1/*
2 * SGI Visual Workstation support and quirks, unmaintained.
3 *
4 * Split out from setup.c by davej@suse.de
5 *
6 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
7 *
8 * SGI Visual Workstation interrupt controller
9 *
10 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
11 * which serves as the main interrupt controller in the system. Non-legacy
12 * hardware in the system uses this controller directly. Legacy devices
13 * are connected to the PIIX4 which in turn has its 8259(s) connected to
14 * a of the Cobalt APIC entry.
15 *
16 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
17 *
18 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
19 */
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/smp.h>
24
25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
29#include <asm/fixmap.h>
30#include <asm/reboot.h>
31#include <asm/setup.h>
32#include <asm/e820.h>
33#include <asm/io.h>
34
35#include <mach_ipi.h>
36
37#include "mach_apic.h"
38
39#include <linux/kernel_stat.h>
40
41#include <asm/i8259.h>
42#include <asm/irq_vectors.h>
43#include <asm/visws/lithium.h>
44
45#include <linux/sched.h>
46#include <linux/kernel.h>
47#include <linux/pci.h>
48#include <linux/pci_ids.h>
49
50extern int no_broadcast;
51
52#include <asm/apic.h>
53
54char visws_board_type = -1;
55char visws_board_rev = -1;
56
57int is_visws_box(void)
58{
59 return visws_board_type >= 0;
60}
61
62static int __init visws_time_init(void)
63{
64 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
65
66 /* Set the countdown value */
67 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
68
69 /* Start the timer */
70 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
71
72 /* Enable (unmask) the timer interrupt */
73 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
74
75 /*
76 * Zero return means the generic timer setup code will set up
77 * the standard vector:
78 */
79 return 0;
80}
81
82static int __init visws_pre_intr_init(void)
83{
84 init_VISWS_APIC_irqs();
85
86 /*
87 * We don't want ISA irqs to be set up by the generic code:
88 */
89 return 1;
90}
91
92/* Quirk for machine specific memory setup. */
93
94#define MB (1024 * 1024)
95
96unsigned long sgivwfb_mem_phys;
97unsigned long sgivwfb_mem_size;
98EXPORT_SYMBOL(sgivwfb_mem_phys);
99EXPORT_SYMBOL(sgivwfb_mem_size);
100
101long long mem_size __initdata = 0;
102
103static char * __init visws_memory_setup(void)
104{
105 long long gfx_mem_size = 8 * MB;
106
107 mem_size = boot_params.alt_mem_k;
108
109 if (!mem_size) {
110 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
111 mem_size = 128 * MB;
112 }
113
114 /*
115 * this hardcodes the graphics memory to 8 MB
116 * it really should be sized dynamically (or at least
117 * set as a boot param)
118 */
119 if (!sgivwfb_mem_size) {
120 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
121 sgivwfb_mem_size = 8 * MB;
122 }
123
124 /*
125 * Trim to nearest MB
126 */
127 sgivwfb_mem_size &= ~((1 << 20) - 1);
128 sgivwfb_mem_phys = mem_size - gfx_mem_size;
129
130 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
131 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
132 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
133
134 return "PROM";
135}
136
137static void visws_machine_emergency_restart(void)
138{
139 /*
140 * Visual Workstations restart after this
141 * register is poked on the PIIX4
142 */
143 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
144}
145
146static void visws_machine_power_off(void)
147{
148 unsigned short pm_status;
149/* extern unsigned int pci_bus0; */
150
151 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
152 outw(pm_status, PMSTS_PORT);
153
154 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
155
156 mdelay(10);
157
158#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
159 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
160
161/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
162 outl(PIIX_SPECIAL_STOP, 0xCFC);
163}
164
165static int __init visws_get_smp_config(unsigned int early)
166{
167 /*
168 * Prevent MP-table parsing by the generic code:
169 */
170 return 1;
171}
172
173/*
174 * The Visual Workstation is Intel MP compliant in the hardware
175 * sense, but it doesn't have a BIOS(-configuration table).
176 * No problem for Linux.
177 */
178
179static void __init MP_processor_info(struct mpc_config_processor *m)
180{
181 int ver, logical_apicid;
182 physid_mask_t apic_cpus;
183
184 if (!(m->mpc_cpuflag & CPU_ENABLED))
185 return;
186
187 logical_apicid = m->mpc_apicid;
188 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
189 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
190 m->mpc_apicid,
191 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
192 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
193 m->mpc_apicver);
194
195 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
196 boot_cpu_physical_apicid = m->mpc_apicid;
197
198 ver = m->mpc_apicver;
199 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
200 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
201 m->mpc_apicid, MAX_APICS);
202 return;
203 }
204
205 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
206 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
207 /*
208 * Validate version
209 */
210 if (ver == 0x0) {
211 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
212 "fixing up to 0x10. (tell your hw vendor)\n",
213 m->mpc_apicid);
214 ver = 0x10;
215 }
216 apic_version[m->mpc_apicid] = ver;
217}
218
219static int __init visws_find_smp_config(unsigned int reserve)
220{
221 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
222 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
223
224 if (ncpus > CO_CPU_MAX) {
225 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
226 ncpus, mp);
227
228 ncpus = CO_CPU_MAX;
229 }
230
231 if (ncpus > setup_max_cpus)
232 ncpus = setup_max_cpus;
233
234#ifdef CONFIG_X86_LOCAL_APIC
235 smp_found_config = 1;
236#endif
237 while (ncpus--)
238 MP_processor_info(mp++);
239
240 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
241
242 return 1;
243}
244
245static int visws_trap_init(void);
246
247static struct x86_quirks visws_x86_quirks __initdata = {
248 .arch_time_init = visws_time_init,
249 .arch_pre_intr_init = visws_pre_intr_init,
250 .arch_memory_setup = visws_memory_setup,
251 .arch_intr_init = NULL,
252 .arch_trap_init = visws_trap_init,
253 .mach_get_smp_config = visws_get_smp_config,
254 .mach_find_smp_config = visws_find_smp_config,
255};
256
257void __init visws_early_detect(void)
258{
259 int raw;
260
261 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
262 >> PIIX_GPI_BD_SHIFT;
263
264 if (visws_board_type < 0)
265 return;
266
267 /*
268 * Install special quirks for timer, interrupt and memory setup:
269 * Fall back to generic behavior for traps:
270 * Override generic MP-table parsing:
271 */
272 x86_quirks = &visws_x86_quirks;
273
274 /*
275 * Install reboot quirks:
276 */
277 pm_power_off = visws_machine_power_off;
278 machine_ops.emergency_restart = visws_machine_emergency_restart;
279
280 /*
281 * Do not use broadcast IPIs:
282 */
283 no_broadcast = 0;
284
285#ifdef CONFIG_X86_IO_APIC
286 /*
287 * Turn off IO-APIC detection and initialization:
288 */
289 skip_ioapic_setup = 1;
290#endif
291
292 /*
293 * Get Board rev.
294 * First, we have to initialize the 307 part to allow us access
295 * to the GPIO registers. Let's map them at 0x0fc0 which is right
296 * after the PIIX4 PM section.
297 */
298 outb_p(SIO_DEV_SEL, SIO_INDEX);
299 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
300
301 outb_p(SIO_DEV_MSB, SIO_INDEX);
302 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
303
304 outb_p(SIO_DEV_LSB, SIO_INDEX);
305 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
306
307 outb_p(SIO_DEV_ENB, SIO_INDEX);
308 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
309
310 /*
311 * Now, we have to map the power management section to write
312 * a bit which enables access to the GPIO registers.
313 * What lunatic came up with this shit?
314 */
315 outb_p(SIO_DEV_SEL, SIO_INDEX);
316 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
317
318 outb_p(SIO_DEV_MSB, SIO_INDEX);
319 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
320
321 outb_p(SIO_DEV_LSB, SIO_INDEX);
322 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
323
324 outb_p(SIO_DEV_ENB, SIO_INDEX);
325 outb_p(1, SIO_DATA); /* Enable PM registers. */
326
327 /*
328 * Now, write the PM register which enables the GPIO registers.
329 */
330 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
331 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
332
333 /*
334 * Now, initialize the GPIO registers.
335 * We want them all to be inputs which is the
336 * power on default, so let's leave them alone.
337 * So, let's just read the board rev!
338 */
339 raw = inb_p(SIO_GP_DATA1);
340 raw &= 0x7f; /* 7 bits of valid board revision ID. */
341
342 if (visws_board_type == VISWS_320) {
343 if (raw < 0x6) {
344 visws_board_rev = 4;
345 } else if (raw < 0xc) {
346 visws_board_rev = 5;
347 } else {
348 visws_board_rev = 6;
349 }
350 } else if (visws_board_type == VISWS_540) {
351 visws_board_rev = 2;
352 } else {
353 visws_board_rev = raw;
354 }
355
356 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
357 (visws_board_type == VISWS_320 ? "320" :
358 (visws_board_type == VISWS_540 ? "540" :
359 "unknown")), visws_board_rev);
360}
361
362#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
363#define BCD (LI_INTB | LI_INTC | LI_INTD)
364#define ALLDEVS (A01234 | BCD)
365
366static __init void lithium_init(void)
367{
368 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
369 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
370
371 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
372 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
373 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
374/* panic("This machine is not SGI Visual Workstation 320/540"); */
375 }
376
377 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
378 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
379 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
380/* panic("This machine is not SGI Visual Workstation 320/540"); */
381 }
382
383 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
384 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
385}
386
387static __init void cobalt_init(void)
388{
389 /*
390 * On normal SMP PC this is used only with SMP, but we have to
391 * use it and set it up here to start the Cobalt clock
392 */
393 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
394 setup_local_APIC();
395 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
396 (unsigned int)apic_read(APIC_LVR),
397 (unsigned int)apic_read(APIC_ID));
398
399 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
400 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
401 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
402 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
403
404 /* Enable Cobalt APIC being careful to NOT change the ID! */
405 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
406
407 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
408 co_apic_read(CO_APIC_ID));
409}
410
411static int __init visws_trap_init(void)
412{
413 lithium_init();
414 cobalt_init();
415
416 return 1;
417}
418
419/*
420 * IRQ controller / APIC support:
421 */
422
423static DEFINE_SPINLOCK(cobalt_lock);
424
425/*
426 * Set the given Cobalt APIC Redirection Table entry to point
427 * to the given IDT vector/index.
428 */
429static inline void co_apic_set(int entry, int irq)
430{
431 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
432 co_apic_write(CO_APIC_HI(entry), 0);
433}
434
435/*
436 * Cobalt (IO)-APIC functions to handle PCI devices.
437 */
438static inline int co_apic_ide0_hack(void)
439{
440 extern char visws_board_type;
441 extern char visws_board_rev;
442
443 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
444 return 5;
445 return CO_APIC_IDE0;
446}
447
448static int is_co_apic(unsigned int irq)
449{
450 if (IS_CO_APIC(irq))
451 return CO_APIC(irq);
452
453 switch (irq) {
454 case 0: return CO_APIC_CPU;
455 case CO_IRQ_IDE0: return co_apic_ide0_hack();
456 case CO_IRQ_IDE1: return CO_APIC_IDE1;
457 default: return -1;
458 }
459}
460
461
462/*
463 * This is the SGI Cobalt (IO-)APIC:
464 */
465
466static void enable_cobalt_irq(unsigned int irq)
467{
468 co_apic_set(is_co_apic(irq), irq);
469}
470
471static void disable_cobalt_irq(unsigned int irq)
472{
473 int entry = is_co_apic(irq);
474
475 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
476 co_apic_read(CO_APIC_LO(entry));
477}
478
479/*
480 * "irq" really just serves to identify the device. Here is where we
481 * map this to the Cobalt APIC entry where it's physically wired.
482 * This is called via request_irq -> setup_irq -> irq_desc->startup()
483 */
484static unsigned int startup_cobalt_irq(unsigned int irq)
485{
486 unsigned long flags;
487
488 spin_lock_irqsave(&cobalt_lock, flags);
489 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
490 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
491 enable_cobalt_irq(irq);
492 spin_unlock_irqrestore(&cobalt_lock, flags);
493 return 0;
494}
495
496static void ack_cobalt_irq(unsigned int irq)
497{
498 unsigned long flags;
499
500 spin_lock_irqsave(&cobalt_lock, flags);
501 disable_cobalt_irq(irq);
502 apic_write(APIC_EOI, APIC_EIO_ACK);
503 spin_unlock_irqrestore(&cobalt_lock, flags);
504}
505
506static void end_cobalt_irq(unsigned int irq)
507{
508 unsigned long flags;
509
510 spin_lock_irqsave(&cobalt_lock, flags);
511 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
512 enable_cobalt_irq(irq);
513 spin_unlock_irqrestore(&cobalt_lock, flags);
514}
515
516static struct irq_chip cobalt_irq_type = {
517 .typename = "Cobalt-APIC",
518 .startup = startup_cobalt_irq,
519 .shutdown = disable_cobalt_irq,
520 .enable = enable_cobalt_irq,
521 .disable = disable_cobalt_irq,
522 .ack = ack_cobalt_irq,
523 .end = end_cobalt_irq,
524};
525
526
527/*
528 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
529 * -- not the manner expected by the code in i8259.c.
530 *
531 * there is a 'master' physical interrupt source that gets sent to
532 * the CPU. But in the chipset there are various 'virtual' interrupts
533 * waiting to be handled. We represent this to Linux through a 'master'
534 * interrupt controller type, and through a special virtual interrupt-
535 * controller. Device drivers only see the virtual interrupt sources.
536 */
537static unsigned int startup_piix4_master_irq(unsigned int irq)
538{
539 init_8259A(0);
540
541 return startup_cobalt_irq(irq);
542}
543
544static void end_piix4_master_irq(unsigned int irq)
545{
546 unsigned long flags;
547
548 spin_lock_irqsave(&cobalt_lock, flags);
549 enable_cobalt_irq(irq);
550 spin_unlock_irqrestore(&cobalt_lock, flags);
551}
552
553static struct irq_chip piix4_master_irq_type = {
554 .typename = "PIIX4-master",
555 .startup = startup_piix4_master_irq,
556 .ack = ack_cobalt_irq,
557 .end = end_piix4_master_irq,
558};
559
560
561static struct irq_chip piix4_virtual_irq_type = {
562 .typename = "PIIX4-virtual",
563 .shutdown = disable_8259A_irq,
564 .enable = enable_8259A_irq,
565 .disable = disable_8259A_irq,
566};
567
568
569/*
570 * PIIX4-8259 master/virtual functions to handle interrupt requests
571 * from legacy devices: floppy, parallel, serial, rtc.
572 *
573 * None of these get Cobalt APIC entries, neither do they have IDT
574 * entries. These interrupts are purely virtual and distributed from
575 * the 'master' interrupt source: CO_IRQ_8259.
576 *
577 * When the 8259 interrupts, its handler figures out which of these
578 * devices is interrupting and dispatches to its handler.
579 *
580 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
581 * enable_irq gets the right irq. This 'master' irq is never directly
582 * manipulated by any driver.
583 */
584static irqreturn_t piix4_master_intr(int irq, void *dev_id)
585{
586 int realirq;
587 irq_desc_t *desc;
588 unsigned long flags;
589
590 spin_lock_irqsave(&i8259A_lock, flags);
591
592 /* Find out what's interrupting in the PIIX4 master 8259 */
593 outb(0x0c, 0x20); /* OCW3 Poll command */
594 realirq = inb(0x20);
595
596 /*
597 * Bit 7 == 0 means invalid/spurious
598 */
599 if (unlikely(!(realirq & 0x80)))
600 goto out_unlock;
601
602 realirq &= 7;
603
604 if (unlikely(realirq == 2)) {
605 outb(0x0c, 0xa0);
606 realirq = inb(0xa0);
607
608 if (unlikely(!(realirq & 0x80)))
609 goto out_unlock;
610
611 realirq = (realirq & 7) + 8;
612 }
613
614 /* mask and ack interrupt */
615 cached_irq_mask |= 1 << realirq;
616 if (unlikely(realirq > 7)) {
617 inb(0xa1);
618 outb(cached_slave_mask, 0xa1);
619 outb(0x60 + (realirq & 7), 0xa0);
620 outb(0x60 + 2, 0x20);
621 } else {
622 inb(0x21);
623 outb(cached_master_mask, 0x21);
624 outb(0x60 + realirq, 0x20);
625 }
626
627 spin_unlock_irqrestore(&i8259A_lock, flags);
628
629 desc = irq_desc + realirq;
630
631 /*
632 * handle this 'virtual interrupt' as a Cobalt one now.
633 */
634 kstat_cpu(smp_processor_id()).irqs[realirq]++;
635
636 if (likely(desc->action != NULL))
637 handle_IRQ_event(realirq, desc->action);
638
639 if (!(desc->status & IRQ_DISABLED))
640 enable_8259A_irq(realirq);
641
642 return IRQ_HANDLED;
643
644out_unlock:
645 spin_unlock_irqrestore(&i8259A_lock, flags);
646 return IRQ_NONE;
647}
648
649static struct irqaction master_action = {
650 .handler = piix4_master_intr,
651 .name = "PIIX4-8259",
652};
653
654static struct irqaction cascade_action = {
655 .handler = no_action,
656 .name = "cascade",
657};
658
659
660void init_VISWS_APIC_irqs(void)
661{
662 int i;
663
664 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
665 irq_desc[i].status = IRQ_DISABLED;
666 irq_desc[i].action = 0;
667 irq_desc[i].depth = 1;
668
669 if (i == 0) {
670 irq_desc[i].chip = &cobalt_irq_type;
671 }
672 else if (i == CO_IRQ_IDE0) {
673 irq_desc[i].chip = &cobalt_irq_type;
674 }
675 else if (i == CO_IRQ_IDE1) {
676 irq_desc[i].chip = &cobalt_irq_type;
677 }
678 else if (i == CO_IRQ_8259) {
679 irq_desc[i].chip = &piix4_master_irq_type;
680 }
681 else if (i < CO_IRQ_APIC0) {
682 irq_desc[i].chip = &piix4_virtual_irq_type;
683 }
684 else if (IS_CO_APIC(i)) {
685 irq_desc[i].chip = &cobalt_irq_type;
686 }
687 }
688
689 setup_irq(CO_IRQ_8259, &master_action);
690 setup_irq(2, &cascade_action);
691}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
46#include <asm/io.h> 46#include <asm/io.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/irq.h> 48#include <asm/irq.h>
49#include <asm/syscalls.h>
49 50
50/* 51/*
51 * Known problems: 52 * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 956f38927aa7..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -151,7 +152,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
151 insns, ip); 152 insns, ip);
152 case PARAVIRT_PATCH(pv_cpu_ops.iret): 153 case PARAVIRT_PATCH(pv_cpu_ops.iret):
153 return patch_internal(VMI_CALL_IRET, len, insns, ip); 154 return patch_internal(VMI_CALL_IRET, len, insns, ip);
154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): 155 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); 156 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
156 default: 157 default:
157 break; 158 break;
@@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc) 235 const void *desc)
235{ 236{
236 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238} 239}
239 240
240static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
392} 393}
393#endif 394#endif
394 395
395static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
396{ 397{
397 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
399} 400}
400 401
401static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
402{ 403{
403 /* 404 /*
404 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
410} 411}
411 412
412static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
413{ 414{
414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
415 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
417} 418}
418 419
419static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
420{ 421{
421 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
423} 424}
424 425
425static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
426{ 427{
427 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
@@ -896,7 +897,7 @@ static inline int __init activate_vmi(void)
896 * the backend. They are performance critical anyway, so requiring 897 * the backend. They are performance critical anyway, so requiring
897 * a patch is not a big problem. 898 * a patch is not a big problem.
898 */ 899 */
899 pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; 900 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
900 pv_cpu_ops.iret = (void *)0xbadbab0; 901 pv_cpu_ops.iret = (void *)0xbadbab0;
901 902
902#ifdef CONFIG_SMP 903#ifdef CONFIG_SMP
@@ -904,9 +905,8 @@ static inline int __init activate_vmi(void)
904#endif 905#endif
905 906
906#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
@@ -932,7 +932,7 @@ static inline int __init activate_vmi(void)
932 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; 932 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
933#endif 933#endif
934 pv_time_ops.sched_clock = vmi_sched_clock; 934 pv_time_ops.sched_clock = vmi_sched_clock;
935 pv_time_ops.get_cpu_khz = vmi_cpu_khz; 935 pv_time_ops.get_tsc_khz = vmi_tsc_khz;
936 936
937 /* We have true wallclock functions; disable CMOS clock sync */ 937 /* We have true wallclock functions; disable CMOS clock sync */
938 no_sync_cmos_clock = 1; 938 no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa9..6953859fe289 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
33#include <asm/apic.h> 33#include <asm/apic.h>
34#include <asm/timer.h> 34#include <asm/timer.h>
35#include <asm/i8253.h> 35#include <asm/i8253.h>
36 36#include <asm/irq_vectors.h>
37#include <irq_vectors.h>
38 37
39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 38#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 39#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -70,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
70 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 69 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
71} 70}
72 71
73/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ 72/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
74unsigned long vmi_cpu_khz(void) 73unsigned long vmi_tsc_khz(void)
75{ 74{
76 unsigned long long khz; 75 unsigned long long khz;
77 khz = vmi_timer_ops.get_cycle_frequency(); 76 khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,23 +49,14 @@ SECTIONS
49 _etext = .; /* End of text section */ 49 _etext = .; /* End of text section */
50 } :text = 0x9090 50 } :text = 0x9090
51 51
52 NOTES :text :note
53
52 . = ALIGN(16); /* Exception table */ 54 . = ALIGN(16); /* Exception table */
53 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 55 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
54 __start___ex_table = .; 56 __start___ex_table = .;
55 *(__ex_table) 57 *(__ex_table)
56 __stop___ex_table = .; 58 __stop___ex_table = .;
57 } 59 } :text = 0x9090
58
59 NOTES :text :note
60
61 BUG_TABLE :text
62
63 . = ALIGN(4);
64 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
65 __tracedata_start = .;
66 *(.tracedata)
67 __tracedata_end = .;
68 }
69 60
70 RODATA 61 RODATA
71 62
@@ -149,10 +140,10 @@ SECTIONS
149 *(.con_initcall.init) 140 *(.con_initcall.init)
150 __con_initcall_end = .; 141 __con_initcall_end = .;
151 } 142 }
152 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
153 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
154 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
155 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
156 } 147 }
157 SECURITY_INIT 148 SECURITY_INIT
158 . = ALIGN(4); 149 . = ALIGN(4);
@@ -189,6 +180,7 @@ SECTIONS
189 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
190 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
191 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
192 *(.data.percpu) 184 *(.data.percpu)
193 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
194 __per_cpu_end = .; 186 __per_cpu_end = .;
@@ -218,3 +210,11 @@ SECTIONS
218 210
219 DWARF_DEBUG 211 DWARF_DEBUG
220} 212}
213
214#ifdef CONFIG_KEXEC
215/* Link time checks */
216#include <asm/kexec.h>
217
218ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
219 "kexec control code size is too big")
220#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */ 22 note PT_NOTE FLAGS(0); /* ___ */
23} 23}
24SECTIONS 24SECTIONS
25{ 25{
@@ -40,26 +40,17 @@ SECTIONS
40 _etext = .; /* End of text section */ 40 _etext = .; /* End of text section */
41 } :text = 0x9090 41 } :text = 0x9090
42 42
43 NOTES :text :note
44
43 . = ALIGN(16); /* Exception table */ 45 . = ALIGN(16); /* Exception table */
44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 46 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
45 __start___ex_table = .; 47 __start___ex_table = .;
46 *(__ex_table) 48 *(__ex_table)
47 __stop___ex_table = .; 49 __stop___ex_table = .;
48 } 50 } :text = 0x9090
49
50 NOTES :text :note
51
52 BUG_TABLE :text
53 51
54 RODATA 52 RODATA
55 53
56 . = ALIGN(4);
57 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
58 __tracedata_start = .;
59 *(.tracedata)
60 __tracedata_end = .;
61 }
62
63 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ 54 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
64 /* Data */ 55 /* Data */
65 .data : AT(ADDR(.data) - LOAD_OFFSET) { 56 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -177,11 +168,11 @@ SECTIONS
177 *(.con_initcall.init) 168 *(.con_initcall.init)
178 } 169 }
179 __con_initcall_end = .; 170 __con_initcall_end = .;
180 __x86cpuvendor_start = .; 171 __x86_cpu_dev_start = .;
181 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
182 *(.x86cpuvendor.init) 173 *(.x86_cpu_dev.init)
183 } 174 }
184 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
185 SECURITY_INIT 176 SECURITY_INIT
186 177
187 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0a..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18
19#include <asm/apic.h>
18#include <asm/pci-direct.h> 20#include <asm/pci-direct.h>
19#include <asm/io.h> 21#include <asm/io.h>
20#include <asm/paravirt.h> 22#include <asm/paravirt.h>
23#include <asm/setup.h>
21 24
22#if defined CONFIG_PCI && defined CONFIG_PARAVIRT 25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
23/* 26/*
@@ -58,7 +61,7 @@ static void vsmp_irq_enable(void)
58 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
59} 62}
60 63
61static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
62 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
63{ 66{
64 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d564..0b8b6690a86d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
42#include <asm/topology.h> 42#include <asm/topology.h>
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) \
46 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
46#define __syscall_clobber "r11","cx","memory" 47#define __syscall_clobber "r11","cx","memory"
47 48
48/* 49/*
@@ -249,7 +250,7 @@ static ctl_table kernel_root_table2[] = {
249 doesn't violate that. We'll find out if it does. */ 250 doesn't violate that. We'll find out if it does. */
250static void __cpuinit vsyscall_set_cpu(int cpu) 251static void __cpuinit vsyscall_set_cpu(int cpu)
251{ 252{
252 unsigned long *d; 253 unsigned long d;
253 unsigned long node = 0; 254 unsigned long node = 0;
254#ifdef CONFIG_NUMA 255#ifdef CONFIG_NUMA
255 node = cpu_to_node(cpu); 256 node = cpu_to_node(cpu);
@@ -260,11 +261,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
260 /* Store cpu number in limit so that it can be loaded quickly 261 /* Store cpu number in limit so that it can be loaded quickly
261 in user space in vgetcpu. 262 in user space in vgetcpu.
262 12 bits for the CPU and 8 bits for the node. */ 263 12 bits for the CPU and 8 bits for the node. */
263 d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); 264 d = 0x0f40000000000ULL;
264 *d = 0x0f40000000000ULL; 265 d |= cpu;
265 *d |= cpu; 266 d |= (node & 0xf) << 12;
266 *d |= (node & 0xf) << 12; 267 d |= (node >> 4) << 48;
267 *d |= (node >> 4) << 48; 268 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
268} 269}
269 270
270static void __cpuinit cpu_vsyscall_init(void *arg) 271static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
278{ 279{
279 long cpu = (long)arg; 280 long cpu = (long)arg;
280 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 281 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
281 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); 282 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
282 return NOTIFY_DONE; 283 return NOTIFY_DONE;
283} 284}
284 285
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void)
301#ifdef CONFIG_SYSCTL 302#ifdef CONFIG_SYSCTL
302 register_sysctl_table(kernel_root_table2); 303 register_sysctl_table(kernel_root_table2);
303#endif 304#endif
304 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); 305 on_each_cpu(cpu_vsyscall_init, NULL, 1);
305 hotcpu_notifier(cpu_vsyscall_notifier, 0); 306 hotcpu_notifier(cpu_vsyscall_notifier, 0);
306 return 0; 307 return 0;
307} 308}
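
For reference, a minimal sketch of the consumer side of the per-CPU descriptor written above: vgetcpu recovers the packed values by reading the segment limit back with the LSL instruction. The helper name and hard-coded selector are illustrative assumptions (in the kernel the selector is __PER_CPU_SEG, i.e. GDT_ENTRY_PER_CPU * 8 + 3).

static inline void sketch_vgetcpu(unsigned int *cpu, unsigned int *node)
{
	unsigned int p;
	unsigned int seg = (15 << 3) | 3;	/* assumed GDT_ENTRY_PER_CPU == 15 */

	/* LSL loads the 20-bit segment limit of the selector into p. */
	asm("lsl %1, %0" : "=r" (p) : "r" (seg));

	if (cpu)
		*cpu = p & 0xfff;		/* bits  0-11: CPU number  */
	if (node)
		*node = p >> 12;		/* bits 12-19: node number */
}
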
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410fb..b545f371b5f5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -2,13 +2,20 @@
2 All C exports should go in the respective C files. */ 2 All C exports should go in the respective C files. */
3 3
4#include <linux/module.h> 4#include <linux/module.h>
5#include <net/checksum.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7 6
7#include <net/checksum.h>
8
8#include <asm/processor.h> 9#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/uaccess.h>
11#include <asm/desc.h> 12#include <asm/desc.h>
13#include <asm/ftrace.h>
14
15#ifdef CONFIG_FTRACE
16/* mcount is defined in assembly */
17EXPORT_SYMBOL(mcount);
18#endif
12 19
13EXPORT_SYMBOL(kernel_thread); 20EXPORT_SYMBOL(kernel_thread);
14 21
@@ -53,8 +60,3 @@ EXPORT_SYMBOL(init_level4_pgt);
53EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
54 61
55EXPORT_SYMBOL(_proxy_pda); 62EXPORT_SYMBOL(_proxy_pda);
56
57#ifdef CONFIG_PARAVIRT
58/* Virtualized guests may want to use it */
59EXPORT_SYMBOL_GPL(cpu_gdt_descr);
60#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..9abac8a9d823
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
61 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout without copying the extended state information
64 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
88 printk("save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
137 * For legacy compatibility, we always set the FP/SSE bits in the bit
138 * vector while saving the state to the user context. This will
139 * enable us to capture any changes (during sigreturn) to
140 * the FP/SSE bits by legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
247 * pointed to by the fpstate pointer in the sigcontext.
248 * This will be saved whenever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
327 * for now the OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
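
As a rough illustration of the signal-frame layout these magic words describe, here is a hedged user-space sketch mirroring check_for_xstate() above: it tests whether a frame carries extended state after the legacy 512-byte fxsave image. struct _fpstate, struct _fpx_sw_bytes and the FP_XSTATE_MAGIC* constants come from <asm/sigcontext.h>; the helper name is illustrative.

#include <string.h>
#include <asm/sigcontext.h>

static int frame_has_xstate(const struct _fpstate *fp)
{
	struct _fpx_sw_bytes sw;
	unsigned int magic2;

	/* magic1 lives in the software-reserved bytes of the fxsave area. */
	memcpy(&sw, &fp->sw_reserved, sizeof(sw));
	if (sw.magic1 != FP_XSTATE_MAGIC1)
		return 0;

	/* magic2 occupies the last word of the extended area. */
	memcpy(&magic2,
	       (const char *)fp + sw.extended_size - FP_XSTATE_MAGIC2_SIZE,
	       sizeof(magic2));
	return magic2 == FP_XSTATE_MAGIC2;
}
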
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select MMU_NOTIFIER
24 select ANON_INODES 25 select ANON_INODES
25 ---help--- 26 ---help---
26 Support hosting fully virtualized guest machines using hardware 27 Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218db..d0e940bb6f40 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,8 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663f..c0f7872a9124 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,19 +193,16 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
208 }
209 } 206 }
210 207
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -308,6 +305,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
308 create_pit_timer(&ps->pit_timer, val, 0); 305 create_pit_timer(&ps->pit_timer, val, 0);
309 break; 306 break;
310 case 2: 307 case 2:
308 case 3:
311 create_pit_timer(&ps->pit_timer, val, 1); 309 create_pit_timer(&ps->pit_timer, val, 1);
312 break; 310 break;
313 default: 311 default:
@@ -459,7 +457,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 457 mutex_unlock(&pit_state->lock);
460} 458}
461 459
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 460static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
461 int len, int is_write)
463{ 462{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 463 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 464 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +499,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 499 mutex_unlock(&pit_state->lock);
501} 500}
502 501
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 502static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
503 int len, int is_write)
504{ 504{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 505 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 506}
@@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm)
575 } 575 }
576} 576}
577 577
578void __inject_pit_timer_intr(struct kvm *kvm) 578static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 579{
580 mutex_lock(&kvm->lock); 580 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def47..c31164e8aa46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 130{
131 struct kvm_pic *s = opaque; 131 struct kvm_pic *s = opaque;
132 132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 133 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 134 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
135 pic_update_irq(s);
136 }
135} 137}
136 138
137/* 139/*
@@ -346,7 +348,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 348 return s->elcr;
347} 349}
348 350
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 351static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
352 int len, int is_write)
350{ 353{
351 switch (addr) { 354 switch (addr) {
352 case 0x20: 355 case 0x20:
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c0..7ca47cbb48bb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae162..73f43de69f67 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 358 break;
359
359 case APIC_DM_NMI: 360 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 361 kvm_inject_nmi(vcpu);
361 break; 362 break;
362 363
363 case APIC_DM_INIT: 364 case APIC_DM_INIT:
@@ -572,6 +573,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 573{
573 u32 val = 0; 574 u32 val = 0;
574 575
576 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
577
575 if (offset >= LAPIC_MMIO_LENGTH) 578 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 579 return 0;
577 580
@@ -695,6 +698,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 698
696 offset &= 0xff0; 699 offset &= 0xff0;
697 700
701 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
702
698 switch (offset) { 703 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 704 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 705 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 785
781} 786}
782 787
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 788static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
789 int len, int size)
784{ 790{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 791 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 792 int ret = 0;
@@ -939,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 945 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 946 wait_queue_head_t *q = &apic->vcpu->wq;
941 947
942 atomic_inc(&apic->timer.pending); 948 if(!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 950 if (waitqueue_active(q)) {
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 952 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9cee..81858881287e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..3da2508eb22a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
72#ifndef MMU_DEBUG 73#ifndef MMU_DEBUG
@@ -652,6 +653,88 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
652 account_shadowed(kvm, gfn); 653 account_shadowed(kvm, gfn);
653} 654}
654 655
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
657{
658 u64 *spte;
659 int need_tlb_flush = 0;
660
661 while ((spte = rmap_next(kvm, rmapp, NULL))) {
662 BUG_ON(!(*spte & PT_PRESENT_MASK));
663 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
664 rmap_remove(kvm, spte);
665 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
666 need_tlb_flush = 1;
667 }
668 return need_tlb_flush;
669}
670
671static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
672 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
673{
674 int i;
675 int retval = 0;
676
677 /*
678 * If mmap_sem isn't taken, we can look the memslots with only
679 * the mmu_lock by skipping over the slots with userspace_addr == 0.
680 */
681 for (i = 0; i < kvm->nmemslots; i++) {
682 struct kvm_memory_slot *memslot = &kvm->memslots[i];
683 unsigned long start = memslot->userspace_addr;
684 unsigned long end;
685
686 /* mmu_lock protects userspace_addr */
687 if (!start)
688 continue;
689
690 end = start + (memslot->npages << PAGE_SHIFT);
691 if (hva >= start && hva < end) {
692 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
693 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
694 retval |= handler(kvm,
695 &memslot->lpage_info[
696 gfn_offset /
697 KVM_PAGES_PER_HPAGE].rmap_pde);
698 }
699 }
700
701 return retval;
702}
703
704int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
705{
706 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
707}
708
709static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
710{
711 u64 *spte;
712 int young = 0;
713
714 /* always return old for EPT */
715 if (!shadow_accessed_mask)
716 return 0;
717
718 spte = rmap_next(kvm, rmapp, NULL);
719 while (spte) {
720 int _young;
721 u64 _spte = *spte;
722 BUG_ON(!(_spte & PT_PRESENT_MASK));
723 _young = _spte & PT_ACCESSED_MASK;
724 if (_young) {
725 young = 1;
726 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
727 }
728 spte = rmap_next(kvm, rmapp, spte);
729 }
730 return young;
731}
732
733int kvm_age_hva(struct kvm *kvm, unsigned long hva)
734{
735 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
736}
737
655#ifdef MMU_DEBUG 738#ifdef MMU_DEBUG
656static int is_empty_shadow_page(u64 *spt) 739static int is_empty_shadow_page(u64 *spt)
657{ 740{
@@ -776,6 +859,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 859 BUG();
777} 860}
778 861
862static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
863 struct kvm_mmu_page *sp)
864{
865 int i;
866
867 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
868 sp->spt[i] = shadow_trap_nonpresent_pte;
869}
870
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 871static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 872{
781 unsigned index; 873 unsigned index;
@@ -841,7 +933,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
841 hlist_add_head(&sp->hash_link, bucket); 933 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 934 if (!metaphysical)
843 rmap_write_protect(vcpu->kvm, gfn); 935 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 936 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
937 vcpu->arch.mmu.prefetch_page(vcpu, sp);
938 else
939 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 940 return sp;
846} 941}
847 942
@@ -917,14 +1012,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
917 } 1012 }
918 kvm_mmu_page_unlink_children(kvm, sp); 1013 kvm_mmu_page_unlink_children(kvm, sp);
919 if (!sp->root_count) { 1014 if (!sp->root_count) {
920 if (!sp->role.metaphysical) 1015 if (!sp->role.metaphysical && !sp->role.invalid)
921 unaccount_shadowed(kvm, sp->gfn); 1016 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 1017 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 1018 kvm_mmu_free_page(kvm, sp);
924 } else { 1019 } else {
1020 int invalid = sp->role.invalid;
925 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1021 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 1022 sp->role.invalid = 1;
927 kvm_reload_remote_mmus(kvm); 1023 kvm_reload_remote_mmus(kvm);
1024 if (!sp->role.metaphysical && !invalid)
1025 unaccount_shadowed(kvm, sp->gfn);
928 } 1026 }
929 kvm_mmu_reset_last_pte_updated(kvm); 1027 kvm_mmu_reset_last_pte_updated(kvm);
930} 1028}
@@ -1103,7 +1201,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1103 mark_page_dirty(vcpu->kvm, gfn); 1201 mark_page_dirty(vcpu->kvm, gfn);
1104 1202
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1203 pgprintk("%s: setting spte %llx\n", __func__, spte);
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", 1204 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", 1205 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); 1206 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1207 set_shadow_pte(shadow_pte, spte);
@@ -1122,8 +1220,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1220 else
1123 kvm_release_pfn_clean(pfn); 1221 kvm_release_pfn_clean(pfn);
1124 } 1222 }
1125 if (!ptwrite || !*ptwrite) 1223 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1224 vcpu->arch.last_pte_updated = shadow_pte;
1225 vcpu->arch.last_pte_gfn = gfn;
1226 }
1127} 1227}
1128 1228
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1229static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1171,9 +1271,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1171 return -ENOMEM; 1271 return -ENOMEM;
1172 } 1272 }
1173 1273
1174 table[index] = __pa(new_table->spt) 1274 set_shadow_pte(&table[index],
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1275 __pa(new_table->spt)
1176 | shadow_user_mask | shadow_x_mask; 1276 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1277 | shadow_user_mask | shadow_x_mask);
1177 } 1278 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK; 1279 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1280 }
@@ -1184,6 +1285,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1184 int r; 1285 int r;
1185 int largepage = 0; 1286 int largepage = 0;
1186 pfn_t pfn; 1287 pfn_t pfn;
1288 unsigned long mmu_seq;
1187 1289
1188 down_read(&current->mm->mmap_sem); 1290 down_read(&current->mm->mmap_sem);
1189 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1191,6 +1293,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1191 largepage = 1; 1293 largepage = 1;
1192 } 1294 }
1193 1295
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */
1194 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1298 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1195 up_read(&current->mm->mmap_sem); 1299 up_read(&current->mm->mmap_sem);
1196 1300
@@ -1201,6 +1305,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1201 } 1305 }
1202 1306
1203 spin_lock(&vcpu->kvm->mmu_lock); 1307 spin_lock(&vcpu->kvm->mmu_lock);
1308 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock;
1204 kvm_mmu_free_some_pages(vcpu); 1310 kvm_mmu_free_some_pages(vcpu);
1205 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1206 PT32E_ROOT_LEVEL); 1312 PT32E_ROOT_LEVEL);
@@ -1208,18 +1314,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1208 1314
1209 1315
1210 return r; 1316 return r;
1211}
1212
1213 1317
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1318out_unlock:
1215 struct kvm_mmu_page *sp) 1319 spin_unlock(&vcpu->kvm->mmu_lock);
1216{ 1320 kvm_release_pfn_clean(pfn);
1217 int i; 1321 return 0;
1218
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1220 sp->spt[i] = shadow_trap_nonpresent_pte;
1221} 1322}
1222 1323
1324
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1325static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1326{
1225 int i; 1327 int i;
@@ -1335,6 +1437,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1335 int r; 1437 int r;
1336 int largepage = 0; 1438 int largepage = 0;
1337 gfn_t gfn = gpa >> PAGE_SHIFT; 1439 gfn_t gfn = gpa >> PAGE_SHIFT;
1440 unsigned long mmu_seq;
1338 1441
1339 ASSERT(vcpu); 1442 ASSERT(vcpu);
1340 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1443 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1348,6 +1451,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1348 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1349 largepage = 1; 1452 largepage = 1;
1350 } 1453 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */
1351 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1456 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1352 up_read(&current->mm->mmap_sem); 1457 up_read(&current->mm->mmap_sem);
1353 if (is_error_pfn(pfn)) { 1458 if (is_error_pfn(pfn)) {
@@ -1355,12 +1460,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1355 return 1; 1460 return 1;
1356 } 1461 }
1357 spin_lock(&vcpu->kvm->mmu_lock); 1462 spin_lock(&vcpu->kvm->mmu_lock);
1463 if (mmu_notifier_retry(vcpu, mmu_seq))
1464 goto out_unlock;
1358 kvm_mmu_free_some_pages(vcpu); 1465 kvm_mmu_free_some_pages(vcpu);
1359 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1360 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1361 spin_unlock(&vcpu->kvm->mmu_lock); 1468 spin_unlock(&vcpu->kvm->mmu_lock);
1362 1469
1363 return r; 1470 return r;
1471
1472out_unlock:
1473 spin_unlock(&vcpu->kvm->mmu_lock);
1474 kvm_release_pfn_clean(pfn);
1475 return 0;
1364} 1476}
1365 1477
1366static void nonpaging_free(struct kvm_vcpu *vcpu) 1478static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1660,6 +1772,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1660 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1661 vcpu->arch.update_pte.largepage = 1; 1773 vcpu->arch.update_pte.largepage = 1;
1662 } 1774 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */
1663 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1777 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1664 up_read(&current->mm->mmap_sem); 1778 up_read(&current->mm->mmap_sem);
1665 1779
@@ -1671,6 +1785,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 1785 vcpu->arch.update_pte.pfn = pfn;
1672} 1786}
1673 1787
1788static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1789{
1790 u64 *spte = vcpu->arch.last_pte_updated;
1791
1792 if (spte
1793 && vcpu->arch.last_pte_gfn == gfn
1794 && shadow_accessed_mask
1795 && !(*spte & shadow_accessed_mask)
1796 && is_shadow_present_pte(*spte))
1797 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1798}
1799
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1800void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 1801 const u8 *new, int bytes)
1676{ 1802{
@@ -1694,6 +1820,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 1820 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1821 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 1822 spin_lock(&vcpu->kvm->mmu_lock);
1823 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 1824 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 1825 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 1826 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1791,6 +1918,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1791 spin_unlock(&vcpu->kvm->mmu_lock); 1918 spin_unlock(&vcpu->kvm->mmu_lock);
1792 return r; 1919 return r;
1793} 1920}
1921EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1794 1922
1795void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 1923void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1796{ 1924{
@@ -1847,6 +1975,12 @@ void kvm_enable_tdp(void)
1847} 1975}
1848EXPORT_SYMBOL_GPL(kvm_enable_tdp); 1976EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1849 1977
1978void kvm_disable_tdp(void)
1979{
1980 tdp_enabled = false;
1981}
1982EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1983
1850static void free_mmu_pages(struct kvm_vcpu *vcpu) 1984static void free_mmu_pages(struct kvm_vcpu *vcpu)
1851{ 1985{
1852 struct kvm_mmu_page *sp; 1986 struct kvm_mmu_page *sp;
@@ -1948,7 +2082,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1948 kvm_flush_remote_tlbs(kvm); 2082 kvm_flush_remote_tlbs(kvm);
1949} 2083}
1950 2084
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2085static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 2086{
1953 struct kvm_mmu_page *page; 2087 struct kvm_mmu_page *page;
1954 2088
@@ -1968,6 +2102,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 2102 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 2103 int npages;
1970 2104
2105 if (!down_read_trylock(&kvm->slots_lock))
2106 continue;
1971 spin_lock(&kvm->mmu_lock); 2107 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 2108 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 2109 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2116,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2116 nr_to_scan--;
1981 2117
1982 spin_unlock(&kvm->mmu_lock); 2118 spin_unlock(&kvm->mmu_lock);
2119 up_read(&kvm->slots_lock);
1983 } 2120 }
1984 if (kvm_freed) 2121 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2122 list_move_tail(&kvm_freed->vm_list, &vm_list);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7a..258e5d56298e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b619396..4a814bff21f2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 pfn = vcpu->arch.update_pte.pfn; 263 pfn = vcpu->arch.update_pte.pfn;
264 if (is_error_pfn(pfn)) 264 if (is_error_pfn(pfn))
265 return; 265 return;
266 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
267 return;
266 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 270 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -343,7 +345,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 shadow_addr = __pa(shadow_page->spt); 345 shadow_addr = __pa(shadow_page->spt);
344 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK 346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
345 | PT_WRITABLE_MASK | PT_USER_MASK; 347 | PT_WRITABLE_MASK | PT_USER_MASK;
346 *shadow_ent = shadow_pte; 348 set_shadow_pte(shadow_ent, shadow_pte);
347 } 349 }
348 350
349 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 int r; 382 int r;
381 pfn_t pfn; 383 pfn_t pfn;
382 int largepage = 0; 384 int largepage = 0;
385 unsigned long mmu_seq;
383 386
384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 387 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 kvm_mmu_audit(vcpu, "pre page fault"); 388 kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 largepage = 1; 416 largepage = 1;
414 } 417 }
415 } 418 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 up_read(&current->mm->mmap_sem); 422 up_read(&current->mm->mmap_sem);
418 423
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
424 } 429 }
425 430
426 spin_lock(&vcpu->kvm->mmu_lock); 431 spin_lock(&vcpu->kvm->mmu_lock);
432 if (mmu_notifier_retry(vcpu, mmu_seq))
433 goto out_unlock;
427 kvm_mmu_free_some_pages(vcpu); 434 kvm_mmu_free_some_pages(vcpu);
428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 435 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 largepage, &write_pt, pfn); 436 largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 spin_unlock(&vcpu->kvm->mmu_lock); 446 spin_unlock(&vcpu->kvm->mmu_lock);
440 447
441 return write_pt; 448 return write_pt;
449
450out_unlock:
451 spin_unlock(&vcpu->kvm->mmu_lock);
452 kvm_release_pfn_clean(pfn);
453 return 0;
442} 454}
443 455
444static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -460,8 +472,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 472static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 473 struct kvm_mmu_page *sp)
462{ 474{
463 int i, offset = 0, r = 0; 475 int i, j, offset, r;
464 pt_element_t pt; 476 pt_element_t pt[256 / sizeof(pt_element_t)];
477 gpa_t pte_gpa;
465 478
466 if (sp->role.metaphysical 479 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 480 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,19 +482,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 482 return;
470 } 483 }
471 484
472 if (PTTYPE == 32) 485 pte_gpa = gfn_to_gpa(sp->gfn);
486 if (PTTYPE == 32) {
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 487 offset = sp->role.quadrant << PT64_LEVEL_BITS;
488 pte_gpa += offset * sizeof(pt_element_t);
489 }
474 490
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 491 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 492 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 493 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
478 494 for (j = 0; j < ARRAY_SIZE(pt); ++j)
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 495 if (r || is_present_pte(pt[j]))
480 sizeof(pt_element_t)); 496 sp->spt[i+j] = shadow_trap_nonpresent_pte;
481 if (r || is_present_pte(pt)) 497 else
482 sp->spt[i] = shadow_trap_nonpresent_pte; 498 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
483 else
484 sp->spt[i] = shadow_notrap_nonpresent_pte;
485 } 499 }
486} 500}
487 501
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab3..8233b86c778c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -27,6 +27,8 @@
27 27
28#include <asm/desc.h> 28#include <asm/desc.h>
29 29
30#define __ex(x) __kvm_handle_fault_on_reboot(x)
31
30MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32 34
@@ -60,6 +62,7 @@ static int npt = 1;
60module_param(npt, int, S_IRUGO); 62module_param(npt, int, S_IRUGO);
61 63
62static void kvm_reput_irq(struct vcpu_svm *svm); 64static void kvm_reput_irq(struct vcpu_svm *svm);
65static void svm_flush_tlb(struct kvm_vcpu *vcpu);
63 66
64static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 67static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
65{ 68{
@@ -129,17 +132,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 132
130static inline void clgi(void) 133static inline void clgi(void)
131{ 134{
132 asm volatile (SVM_CLGI); 135 asm volatile (__ex(SVM_CLGI));
133} 136}
134 137
135static inline void stgi(void) 138static inline void stgi(void)
136{ 139{
137 asm volatile (SVM_STGI); 140 asm volatile (__ex(SVM_STGI));
138} 141}
139 142
140static inline void invlpga(unsigned long addr, u32 asid) 143static inline void invlpga(unsigned long addr, u32 asid)
141{ 144{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 145 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 146}
144 147
145static inline unsigned long kvm_read_cr2(void) 148static inline unsigned long kvm_read_cr2(void)
@@ -270,19 +273,11 @@ static int has_svm(void)
270 273
271static void svm_hardware_disable(void *garbage) 274static void svm_hardware_disable(void *garbage)
272{ 275{
273 struct svm_cpu_data *svm_data 276 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 277
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 278 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 279 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 280 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 281}
287 282
288static void svm_hardware_enable(void *garbage) 283static void svm_hardware_enable(void *garbage)
@@ -321,6 +316,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 316 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 317}
323 318
319static void svm_cpu_uninit(int cpu)
320{
321 struct svm_cpu_data *svm_data
322 = per_cpu(svm_data, raw_smp_processor_id());
323
324 if (!svm_data)
325 return;
326
327 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
328 __free_page(svm_data->save_area);
329 kfree(svm_data);
330}
331
324static int svm_cpu_init(int cpu) 332static int svm_cpu_init(int cpu)
325{ 333{
326 struct svm_cpu_data *svm_data; 334 struct svm_cpu_data *svm_data;
@@ -446,7 +454,8 @@ static __init int svm_hardware_setup(void)
446 if (npt_enabled) { 454 if (npt_enabled) {
447 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 455 printk(KERN_INFO "kvm: Nested Paging enabled\n");
448 kvm_enable_tdp(); 456 kvm_enable_tdp();
449 } 457 } else
458 kvm_disable_tdp();
450 459
451 return 0; 460 return 0;
452 461
@@ -458,6 +467,11 @@ err:
458 467
459static __exit void svm_hardware_unsetup(void) 468static __exit void svm_hardware_unsetup(void)
460{ 469{
470 int cpu;
471
472 for_each_online_cpu(cpu)
473 svm_cpu_uninit(cpu);
474
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 475 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 476 iopm_base = 0;
463} 477}
@@ -707,10 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
708} 722}
709 723
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu) 724static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{ 725{
716 struct vcpu_svm *svm = to_svm(vcpu); 726 struct vcpu_svm *svm = to_svm(vcpu);
@@ -869,6 +879,10 @@ set:
869static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 879static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
870{ 880{
871 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 881 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
882 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
883
884 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
885 force_new_asid(vcpu);
872 886
873 vcpu->arch.cr4 = cr4; 887 vcpu->arch.cr4 = cr4;
874 if (!npt_enabled) 888 if (!npt_enabled)
@@ -949,7 +963,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 963
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 964static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 965{
952 return to_svm(vcpu)->db_regs[dr]; 966 unsigned long val = to_svm(vcpu)->db_regs[dr];
967 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
968 return val;
953} 969}
954 970
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 971static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -997,13 +1013,35 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
997 struct kvm *kvm = svm->vcpu.kvm; 1013 struct kvm *kvm = svm->vcpu.kvm;
998 u64 fault_address; 1014 u64 fault_address;
999 u32 error_code; 1015 u32 error_code;
1016 bool event_injection = false;
1000 1017
1001 if (!irqchip_in_kernel(kvm) && 1018 if (!irqchip_in_kernel(kvm) &&
1002 is_external_interrupt(exit_int_info)) 1019 is_external_interrupt(exit_int_info)) {
1020 event_injection = true;
1003 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1021 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1022 }
1004 1023
1005 fault_address = svm->vmcb->control.exit_info_2; 1024 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1025 error_code = svm->vmcb->control.exit_info_1;
1026
1027 if (!npt_enabled)
1028 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1029 (u32)fault_address, (u32)(fault_address >> 32),
1030 handler);
1031 else
1032 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1033 (u32)fault_address, (u32)(fault_address >> 32),
1034 handler);
1035 /*
1036 * FIXME: This shouldn't be necessary here, but there is a flush
1037 * missing in the MMU code. Until we find this bug, flush the
1038 * complete TLB here on an NPF
1039 */
1040 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu);
1042
1043 if (event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1046}
1009 1047
@@ -1081,6 +1119,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1119 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1120}
1083 1121
1122static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1123{
1124 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1125 return 1;
1126}
1127
1128static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1129{
1130 ++svm->vcpu.stat.irq_exits;
1131 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1132 return 1;
1133}
1134
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1135static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1136{
1086 return 1; 1137 return 1;
@@ -1219,6 +1270,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1270 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1271 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1272 else {
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler);
1275
1222 svm->vmcb->save.rax = data & 0xffffffff; 1276 svm->vmcb->save.rax = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1278 svm->next_rip = svm->vmcb->save.rip + 2;
@@ -1284,16 +1338,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1338 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1339 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1340 case MSR_K7_EVNTSEL3:
1341 case MSR_K7_PERFCTR0:
1342 case MSR_K7_PERFCTR1:
1343 case MSR_K7_PERFCTR2:
1344 case MSR_K7_PERFCTR3:
1287 /* 1345 /*
1288 * only support writing 0 to the performance counters for now 1346 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1347 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1348 * happy
1291 */ 1349 */
1292 if (data != 0) 1350 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1351
1294 break; 1352 break;
1295 default: 1353 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1354 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1355 }
1299 return 0; 1356 return 0;
@@ -1304,6 +1361,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1362 u64 data = (svm->vmcb->save.rax & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler);
1367
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1368 svm->next_rip = svm->vmcb->save.rip + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1369 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1370 kvm_inject_gp(&svm->vcpu, 0);
@@ -1323,6 +1384,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1384static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1385 struct kvm_run *kvm_run)
1325{ 1386{
1387 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1388
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1389 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1390 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1391 /*
@@ -1364,8 +1427,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1427 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1428 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1429 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1430 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1431 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1432 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1433 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1434 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1397,6 +1460,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1460 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1461 u32 exit_code = svm->vmcb->control.exit_code;
1399 1462
1463 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1464 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1465
1400 if (npt_enabled) { 1466 if (npt_enabled) {
1401 int mmu_reload = 0; 1467 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1468 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1536,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1536{
1471 struct vmcb_control_area *control; 1537 struct vmcb_control_area *control;
1472 1538
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540
1473 control = &svm->vmcb->control; 1541 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1542 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1543 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1660,9 +1728,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660 sync_lapic_to_cr8(vcpu); 1728 sync_lapic_to_cr8(vcpu);
1661 1729
1662 save_host_msrs(vcpu); 1730 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1731 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1732 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1733 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1734 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1735 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1736 svm->host_dr7 = read_dr7();
@@ -1716,17 +1784,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1716 /* Enter guest mode */ 1784 /* Enter guest mode */
1717 "push %%rax \n\t" 1785 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t" 1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t" 1787 __ex(SVM_VMLOAD) "\n\t"
1720 SVM_VMRUN "\n\t" 1788 __ex(SVM_VMRUN) "\n\t"
1721 SVM_VMSAVE "\n\t" 1789 __ex(SVM_VMSAVE) "\n\t"
1722 "pop %%rax \n\t" 1790 "pop %%rax \n\t"
1723#else 1791#else
1724 /* Enter guest mode */ 1792 /* Enter guest mode */
1725 "push %%eax \n\t" 1793 "push %%eax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1794 "mov %c[vmcb](%[svm]), %%eax \n\t"
1727 SVM_VMLOAD "\n\t" 1795 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1796 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1797 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1798 "pop %%eax \n\t"
1731#endif 1799#endif
1732 1800
@@ -1795,9 +1863,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1795 write_dr7(svm->host_dr7); 1863 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1864 kvm_write_cr2(svm->host_cr2);
1797 1865
1798 load_fs(fs_selector); 1866 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1867 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1868 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1869 load_host_msrs(vcpu);
1802 1870
1803 reload_tss(vcpu); 1871 reload_tss(vcpu);
@@ -1889,7 +1957,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1957 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1958 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1959 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1960
1894 .set_guest_debug = svm_guest_debug, 1961 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1962 .get_msr = svm_get_msr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 540e95179074..7041cc52b562 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -30,6 +30,8 @@
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33#define __ex(x) __kvm_handle_fault_on_reboot(x)
34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 36MODULE_LICENSE("GPL");
35 37
@@ -53,6 +55,7 @@ struct vmcs {
53 55
54struct vcpu_vmx { 56struct vcpu_vmx {
55 struct kvm_vcpu vcpu; 57 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link;
56 int launched; 59 int launched;
57 u8 fail; 60 u8 fail;
58 u32 idt_vectoring_info; 61 u32 idt_vectoring_info;
@@ -88,9 +91,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
88} 91}
89 92
90static int init_rmode(struct kvm *kvm); 93static int init_rmode(struct kvm *kvm);
94static u64 construct_eptp(unsigned long root_hpa);
91 95
92static DEFINE_PER_CPU(struct vmcs *, vmxarea); 96static DEFINE_PER_CPU(struct vmcs *, vmxarea);
93static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 97static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
98static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
94 99
95static struct page *vmx_io_bitmap_a; 100static struct page *vmx_io_bitmap_a;
96static struct page *vmx_io_bitmap_b; 101static struct page *vmx_io_bitmap_b;
@@ -260,6 +265,11 @@ static inline int cpu_has_vmx_vpid(void)
260 SECONDARY_EXEC_ENABLE_VPID); 265 SECONDARY_EXEC_ENABLE_VPID);
261} 266}
262 267
268static inline int cpu_has_virtual_nmis(void)
269{
270 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
271}
272
263static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 273static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
264{ 274{
265 int i; 275 int i;
@@ -278,7 +288,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
278 u64 gva; 288 u64 gva;
279 } operand = { vpid, 0, gva }; 289 } operand = { vpid, 0, gva };
280 290
281 asm volatile (ASM_VMX_INVVPID 291 asm volatile (__ex(ASM_VMX_INVVPID)
282 /* CF==1 or ZF==1 --> rc = -1 */ 292 /* CF==1 or ZF==1 --> rc = -1 */
283 "; ja 1f ; ud2 ; 1:" 293 "; ja 1f ; ud2 ; 1:"
284 : : "a"(&operand), "c"(ext) : "cc", "memory"); 294 : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +300,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
290 u64 eptp, gpa; 300 u64 eptp, gpa;
291 } operand = {eptp, gpa}; 301 } operand = {eptp, gpa};
292 302
293 asm volatile (ASM_VMX_INVEPT 303 asm volatile (__ex(ASM_VMX_INVEPT)
294 /* CF==1 or ZF==1 --> rc = -1 */ 304 /* CF==1 or ZF==1 --> rc = -1 */
295 "; ja 1f ; ud2 ; 1:\n" 305 "; ja 1f ; ud2 ; 1:\n"
296 : : "a" (&operand), "c" (ext) : "cc", "memory"); 306 : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +321,7 @@ static void vmcs_clear(struct vmcs *vmcs)
311 u64 phys_addr = __pa(vmcs); 321 u64 phys_addr = __pa(vmcs);
312 u8 error; 322 u8 error;
313 323
314 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 324 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
315 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 325 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
316 : "cc", "memory"); 326 : "cc", "memory");
317 if (error) 327 if (error)
@@ -329,14 +339,16 @@ static void __vcpu_clear(void *arg)
329 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 339 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
330 per_cpu(current_vmcs, cpu) = NULL; 340 per_cpu(current_vmcs, cpu) = NULL;
331 rdtscll(vmx->vcpu.arch.host_tsc); 341 rdtscll(vmx->vcpu.arch.host_tsc);
342 list_del(&vmx->local_vcpus_link);
343 vmx->vcpu.cpu = -1;
344 vmx->launched = 0;
332} 345}
333 346
334static void vcpu_clear(struct vcpu_vmx *vmx) 347static void vcpu_clear(struct vcpu_vmx *vmx)
335{ 348{
336 if (vmx->vcpu.cpu == -1) 349 if (vmx->vcpu.cpu == -1)
337 return; 350 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); 351 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0;
340} 352}
341 353
342static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 354static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +390,7 @@ static unsigned long vmcs_readl(unsigned long field)
378{ 390{
379 unsigned long value; 391 unsigned long value;
380 392
381 asm volatile (ASM_VMX_VMREAD_RDX_RAX 393 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
382 : "=a"(value) : "d"(field) : "cc"); 394 : "=a"(value) : "d"(field) : "cc");
383 return value; 395 return value;
384} 396}
@@ -413,7 +425,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
413{ 425{
414 u8 error; 426 u8 error;
415 427
416 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 428 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
417 : "=q"(error) : "a"(value), "d"(field) : "cc"); 429 : "=q"(error) : "a"(value), "d"(field) : "cc");
418 if (unlikely(error)) 430 if (unlikely(error))
419 vmwrite_error(field, value); 431 vmwrite_error(field, value);
@@ -431,10 +443,8 @@ static void vmcs_write32(unsigned long field, u32 value)
431 443
432static void vmcs_write64(unsigned long field, u64 value) 444static void vmcs_write64(unsigned long field, u64 value)
433{ 445{
434#ifdef CONFIG_X86_64
435 vmcs_writel(field, value);
436#else
437 vmcs_writel(field, value); 446 vmcs_writel(field, value);
447#ifndef CONFIG_X86_64
438 asm volatile (""); 448 asm volatile ("");
439 vmcs_writel(field+1, value >> 32); 449 vmcs_writel(field+1, value >> 32);
440#endif 450#endif
@@ -474,7 +484,7 @@ static void reload_tss(void)
474 struct descriptor_table gdt; 484 struct descriptor_table gdt;
475 struct desc_struct *descs; 485 struct desc_struct *descs;
476 486
477 get_gdt(&gdt); 487 kvm_get_gdt(&gdt);
478 descs = (void *)gdt.base; 488 descs = (void *)gdt.base;
479 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 489 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
480 load_TR_desc(); 490 load_TR_desc();
@@ -530,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
530 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 540 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
531 * allow segment selectors with cpl > 0 or ti == 1. 541 * allow segment selectors with cpl > 0 or ti == 1.
532 */ 542 */
533 vmx->host_state.ldt_sel = read_ldt(); 543 vmx->host_state.ldt_sel = kvm_read_ldt();
534 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 544 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
535 vmx->host_state.fs_sel = read_fs(); 545 vmx->host_state.fs_sel = kvm_read_fs();
536 if (!(vmx->host_state.fs_sel & 7)) { 546 if (!(vmx->host_state.fs_sel & 7)) {
537 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 547 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
538 vmx->host_state.fs_reload_needed = 0; 548 vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
540 vmcs_write16(HOST_FS_SELECTOR, 0); 550 vmcs_write16(HOST_FS_SELECTOR, 0);
541 vmx->host_state.fs_reload_needed = 1; 551 vmx->host_state.fs_reload_needed = 1;
542 } 552 }
543 vmx->host_state.gs_sel = read_gs(); 553 vmx->host_state.gs_sel = kvm_read_gs();
544 if (!(vmx->host_state.gs_sel & 7)) 554 if (!(vmx->host_state.gs_sel & 7))
545 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 555 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
546 else { 556 else {
@@ -576,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
576 ++vmx->vcpu.stat.host_state_reload; 586 ++vmx->vcpu.stat.host_state_reload;
577 vmx->host_state.loaded = 0; 587 vmx->host_state.loaded = 0;
578 if (vmx->host_state.fs_reload_needed) 588 if (vmx->host_state.fs_reload_needed)
579 load_fs(vmx->host_state.fs_sel); 589 kvm_load_fs(vmx->host_state.fs_sel);
580 if (vmx->host_state.gs_ldt_reload_needed) { 590 if (vmx->host_state.gs_ldt_reload_needed) {
581 load_ldt(vmx->host_state.ldt_sel); 591 kvm_load_ldt(vmx->host_state.ldt_sel);
582 /* 592 /*
583 * If we have to reload gs, we must take care to 593 * If we have to reload gs, we must take care to
584 * preserve our gs base. 594 * preserve our gs base.
585 */ 595 */
586 local_irq_save(flags); 596 local_irq_save(flags);
587 load_gs(vmx->host_state.gs_sel); 597 kvm_load_gs(vmx->host_state.gs_sel);
588#ifdef CONFIG_X86_64 598#ifdef CONFIG_X86_64
589 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 599 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
590#endif 600#endif
@@ -617,13 +627,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
617 vcpu_clear(vmx); 627 vcpu_clear(vmx);
618 kvm_migrate_timers(vcpu); 628 kvm_migrate_timers(vcpu);
619 vpid_sync_vcpu_all(vmx); 629 vpid_sync_vcpu_all(vmx);
630 local_irq_disable();
631 list_add(&vmx->local_vcpus_link,
632 &per_cpu(vcpus_on_cpu, cpu));
633 local_irq_enable();
620 } 634 }
621 635
622 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 636 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
623 u8 error; 637 u8 error;
624 638
625 per_cpu(current_vmcs, cpu) = vmx->vmcs; 639 per_cpu(current_vmcs, cpu) = vmx->vmcs;
626 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 640 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
627 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 641 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
628 : "cc"); 642 : "cc");
629 if (error) 643 if (error)
@@ -640,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
640 * Linux uses per-cpu TSS and GDT, so set these when switching 654 * Linux uses per-cpu TSS and GDT, so set these when switching
641 * processors. 655 * processors.
642 */ 656 */
643 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 657 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
644 get_gdt(&dt); 658 kvm_get_gdt(&dt);
645 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 659 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
646 660
647 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 661 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +698,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
684 update_exception_bitmap(vcpu); 698 update_exception_bitmap(vcpu);
685} 699}
686 700
687static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
688{
689 vcpu_clear(to_vmx(vcpu));
690}
691
692static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 701static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
693{ 702{
694 return vmcs_readl(GUEST_RFLAGS); 703 return vmcs_readl(GUEST_RFLAGS);
@@ -913,6 +922,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
913 case MSR_IA32_TIME_STAMP_COUNTER: 922 case MSR_IA32_TIME_STAMP_COUNTER:
914 guest_write_tsc(data); 923 guest_write_tsc(data);
915 break; 924 break;
925 case MSR_P6_PERFCTR0:
926 case MSR_P6_PERFCTR1:
927 case MSR_P6_EVNTSEL0:
928 case MSR_P6_EVNTSEL1:
929 /*
930 * Just discard all writes to the performance counters; this
931 * should keep both older linux and windows 64-bit guests
932 * happy
933 */
934 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
935
936 break;
916 default: 937 default:
917 vmx_load_host_state(vmx); 938 vmx_load_host_state(vmx);
918 msr = find_msr_entry(vmx, msr_index); 939 msr = find_msr_entry(vmx, msr_index);
@@ -1022,6 +1043,7 @@ static void hardware_enable(void *garbage)
1022 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1043 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1023 u64 old; 1044 u64 old;
1024 1045
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1025 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1026 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
1027 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
@@ -1032,13 +1054,25 @@ static void hardware_enable(void *garbage)
1032 MSR_IA32_FEATURE_CONTROL_LOCKED | 1054 MSR_IA32_FEATURE_CONTROL_LOCKED |
1033 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
1034 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1035 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 1057 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr)
1036 : "memory", "cc"); 1059 : "memory", "cc");
1037} 1060}
1038 1061
1062static void vmclear_local_vcpus(void)
1063{
1064 int cpu = raw_smp_processor_id();
1065 struct vcpu_vmx *vmx, *n;
1066
1067 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1068 local_vcpus_link)
1069 __vcpu_clear(vmx);
1070}
1071
1039static void hardware_disable(void *garbage) 1072static void hardware_disable(void *garbage)
1040{ 1073{
1041 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1074 vmclear_local_vcpus();
1075 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1076 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1043} 1077}
1044 1078
@@ -1072,7 +1106,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1072 u32 _vmentry_control = 0; 1106 u32 _vmentry_control = 0;
1073 1107
1074 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 1108 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1075 opt = 0; 1109 opt = PIN_BASED_VIRTUAL_NMIS;
1076 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 1110 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1077 &_pin_based_exec_control) < 0) 1111 &_pin_based_exec_control) < 0)
1078 return -EIO; 1112 return -EIO;
@@ -1389,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1389static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1423static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1390{ 1424{
1391 vpid_sync_vcpu_all(to_vmx(vcpu)); 1425 vpid_sync_vcpu_all(to_vmx(vcpu));
1426 if (vm_need_ept())
1427 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1392} 1428}
1393 1429
1394static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1430static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1456,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1420 if (!(cr0 & X86_CR0_PG)) { 1456 if (!(cr0 & X86_CR0_PG)) {
1421 /* From paging/starting to nonpaging */ 1457 /* From paging/starting to nonpaging */
1422 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1458 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1423 vmcs_config.cpu_based_exec_ctrl | 1459 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1424 (CPU_BASED_CR3_LOAD_EXITING | 1460 (CPU_BASED_CR3_LOAD_EXITING |
1425 CPU_BASED_CR3_STORE_EXITING)); 1461 CPU_BASED_CR3_STORE_EXITING));
1426 vcpu->arch.cr0 = cr0; 1462 vcpu->arch.cr0 = cr0;
@@ -1430,7 +1466,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1430 } else if (!is_paging(vcpu)) { 1466 } else if (!is_paging(vcpu)) {
1431 /* From nonpaging to paging */ 1467 /* From nonpaging to paging */
1432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1468 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1433 vmcs_config.cpu_based_exec_ctrl & 1469 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1434 ~(CPU_BASED_CR3_LOAD_EXITING | 1470 ~(CPU_BASED_CR3_LOAD_EXITING |
1435 CPU_BASED_CR3_STORE_EXITING)); 1471 CPU_BASED_CR3_STORE_EXITING));
1436 vcpu->arch.cr0 = cr0; 1472 vcpu->arch.cr0 = cr0;
@@ -1821,7 +1857,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
1821 spin_unlock(&vmx_vpid_lock); 1857 spin_unlock(&vmx_vpid_lock);
1822} 1858}
1823 1859
1824void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 1860static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1825{ 1861{
1826 void *va; 1862 void *va;
1827 1863
@@ -1907,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1907 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 1943 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1908 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1944 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1909 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1945 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1910 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 1946 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
1911 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 1947 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
1912 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1948 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1913#ifdef CONFIG_X86_64 1949#ifdef CONFIG_X86_64
1914 rdmsrl(MSR_FS_BASE, a); 1950 rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1922 1958
1923 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 1959 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1924 1960
1925 get_idt(&dt); 1961 kvm_get_idt(&dt);
1926 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1962 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1927 1963
1928 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1964 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -2114,6 +2150,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2114 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2150 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2115} 2151}
2116 2152
2153static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158}
2159
2117static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2118{ 2161{
2119 int word_index = __ffs(vcpu->arch.irq_summary); 2162 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2255,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2255 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2298 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2256 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2257 (u32)((u64)cr2 >> 32), handler); 2300 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2258 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2303 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2259 } 2304 }
2260 2305
@@ -2554,8 +2599,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2554 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2599 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2555 offset = exit_qualification & 0xffful; 2600 offset = exit_qualification & 0xffful;
2556 2601
2557 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2558
2559 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2602 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2560 2603
2561 if (er != EMULATE_DONE) { 2604 if (er != EMULATE_DONE) {
@@ -2639,6 +2682,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2639 return 1; 2682 return 1;
2640} 2683}
2641 2684
2685static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2686{
2687 u32 cpu_based_vm_exec_control;
2688
2689 /* clear pending NMI */
2690 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2691 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2692 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2693 ++vcpu->stat.nmi_window_exits;
2694
2695 return 1;
2696}
2697
2642/* 2698/*
2643 * The exit handlers return 1 if the exit was handled fully and guest execution 2699 * The exit handlers return 1 if the exit was handled fully and guest execution
2644 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2705,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2649 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 2705 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2650 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 2706 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2651 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 2707 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2708 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
2652 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 2709 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2653 [EXIT_REASON_CR_ACCESS] = handle_cr, 2710 [EXIT_REASON_CR_ACCESS] = handle_cr,
2654 [EXIT_REASON_DR_ACCESS] = handle_dr, 2711 [EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -2736,17 +2793,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2736 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2793 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2737} 2794}
2738 2795
2796static void enable_nmi_window(struct kvm_vcpu *vcpu)
2797{
2798 u32 cpu_based_vm_exec_control;
2799
2800 if (!cpu_has_virtual_nmis())
2801 return;
2802
2803 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2804 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2805 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2806}
2807
2808static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
2809{
2810 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2811 return !(guest_intr & (GUEST_INTR_STATE_NMI |
2812 GUEST_INTR_STATE_MOV_SS |
2813 GUEST_INTR_STATE_STI));
2814}
2815
2816static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
2817{
2818 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2819 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
2820 GUEST_INTR_STATE_STI)) &&
2821 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
2822}
2823
2824static void enable_intr_window(struct kvm_vcpu *vcpu)
2825{
2826 if (vcpu->arch.nmi_pending)
2827 enable_nmi_window(vcpu);
2828 else if (kvm_cpu_has_interrupt(vcpu))
2829 enable_irq_window(vcpu);
2830}
2831
2739static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2832static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2740{ 2833{
2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 2834 struct vcpu_vmx *vmx = to_vmx(vcpu);
2742 u32 idtv_info_field, intr_info_field; 2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field;
2743 int has_ext_irq, interrupt_window_open;
2744 int vector; 2836 int vector;
2745 2837
2746 update_tpr_threshold(vcpu); 2838 update_tpr_threshold(vcpu);
2747 2839
2748 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2749 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
2750 idtv_info_field = vmx->idt_vectoring_info; 2842 idtv_info_field = vmx->idt_vectoring_info;
2751 if (intr_info_field & INTR_INFO_VALID_MASK) { 2843 if (intr_info_field & INTR_INFO_VALID_MASK) {
2752 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2844 if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2754,8 +2846,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2754 if (printk_ratelimit()) 2846 if (printk_ratelimit())
2755 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2847 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2756 } 2848 }
2757 if (has_ext_irq) 2849 enable_intr_window(vcpu);
2758 enable_irq_window(vcpu);
2759 return; 2850 return;
2760 } 2851 }
2761 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
@@ -2765,30 +2856,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2765 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; 2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2766 2857
2767 vmx_inject_irq(vcpu, vect); 2858 vmx_inject_irq(vcpu, vect);
2768 if (unlikely(has_ext_irq)) 2859 enable_intr_window(vcpu);
2769 enable_irq_window(vcpu);
2770 return; 2860 return;
2771 } 2861 }
2772 2862
2773 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); 2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2774 2864
2775 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2865 /*
2866 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted.
2869 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2874 ~GUEST_INTR_STATE_NMI);
2875
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
2877 & ~INTR_INFO_RESVD_BITS_MASK);
2776 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2777 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2778 2880
2779 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
2780 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2781 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 2883 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2782 if (unlikely(has_ext_irq)) 2884 enable_intr_window(vcpu);
2783 enable_irq_window(vcpu);
2784 return; 2885 return;
2785 } 2886 }
2786 if (!has_ext_irq) 2887 if (cpu_has_virtual_nmis()) {
2888 /*
2889 * SDM 3: 25.7.1.2
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2891 * a guest IRET fault.
2892 */
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
2897 GUEST_INTR_STATE_NMI);
2898 else if (vcpu->arch.nmi_pending) {
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu);
2902 return;
2903 }
2904
2905 }
2906 if (!kvm_cpu_has_interrupt(vcpu))
2787 return; 2907 return;
2788 interrupt_window_open = 2908 if (vmx_irq_enabled(vcpu)) {
2789 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2790 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2791 if (interrupt_window_open) {
2792 vector = kvm_cpu_get_interrupt(vcpu); 2909 vector = kvm_cpu_get_interrupt(vcpu);
2793 vmx_inject_irq(vcpu, vector); 2910 vmx_inject_irq(vcpu, vector);
2794 kvm_timer_intr_post(vcpu, vector); 2911 kvm_timer_intr_post(vcpu, vector);
@@ -2838,7 +2955,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 "push %%edx; push %%ebp;" 2955 "push %%edx; push %%ebp;"
2839 "push %%ecx \n\t" 2956 "push %%ecx \n\t"
2840#endif 2957#endif
2841 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
2842 /* Check if vmlaunch of vmresume is needed */ 2959 /* Check if vmlaunch of vmresume is needed */
2843 "cmpl $0, %c[launched](%0) \n\t" 2960 "cmpl $0, %c[launched](%0) \n\t"
2844 /* Load guest registers. Don't clobber flags. */ 2961 /* Load guest registers. Don't clobber flags. */
@@ -2873,9 +2990,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2873#endif 2990#endif
2874 /* Enter guest mode */ 2991 /* Enter guest mode */
2875 "jne .Llaunched \n\t" 2992 "jne .Llaunched \n\t"
2876 ASM_VMX_VMLAUNCH "\n\t" 2993 __ex(ASM_VMX_VMLAUNCH) "\n\t"
2877 "jmp .Lkvm_vmx_return \n\t" 2994 "jmp .Lkvm_vmx_return \n\t"
2878 ".Llaunched: " ASM_VMX_VMRESUME "\n\t" 2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2879 ".Lkvm_vmx_return: " 2996 ".Lkvm_vmx_return: "
2880 /* Save guest registers, load host registers, keep flags */ 2997 /* Save guest registers, load host registers, keep flags */
2881#ifdef CONFIG_X86_64 2998#ifdef CONFIG_X86_64
@@ -2949,7 +3066,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2949 fixup_rmode_irq(vmx); 3066 fixup_rmode_irq(vmx);
2950 3067
2951 vcpu->arch.interrupt_window_open = 3068 vcpu->arch.interrupt_window_open =
2952 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 3069 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3070 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
2953 3071
2954 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3072 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2955 vmx->launched = 1; 3073 vmx->launched = 1;
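
With the named interruptibility-state flags, the old magic "& 3" test becomes explicit: an external interrupt may be injected only when the guest's RFLAGS.IF is set and neither the STI shadow nor the MOV-SS shadow is active. A small standalone version of that predicate, assuming only the flag values from vmx.h (the function name is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_EFLAGS_IF           0x00000200
#define GUEST_INTR_STATE_STI    0x00000001
#define GUEST_INTR_STATE_MOV_SS 0x00000002

/* Illustrative predicate mirroring the interrupt_window_open computation. */
static bool irq_window_open(uint64_t rflags, uint32_t interruptibility)
{
        return (rflags & X86_EFLAGS_IF) &&
               !(interruptibility & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}

int main(void)
{
        printf("IF set, no shadow: %d\n", irq_window_open(X86_EFLAGS_IF, 0));
        printf("IF set, STI shadow: %d\n",
               irq_window_open(X86_EFLAGS_IF, GUEST_INTR_STATE_STI));
        return 0;
}
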
@@ -2957,7 +3075,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2957 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3075 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2958 3076
2959 /* We need to handle NMIs before interrupts are enabled */ 3077 /* We need to handle NMIs before interrupts are enabled */
2960 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 3078 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
3079 (intr_info & INTR_INFO_VALID_MASK)) {
2961 KVMTRACE_0D(NMI, vcpu, handler); 3080 KVMTRACE_0D(NMI, vcpu, handler);
2962 asm("int $2"); 3081 asm("int $2");
2963 } 3082 }
@@ -2968,7 +3087,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3087 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 3088
2970 if (vmx->vmcs) { 3089 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 0, 1); 3090 vcpu_clear(vmx);
2972 free_vmcs(vmx->vmcs); 3091 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 3092 vmx->vmcs = NULL;
2974 } 3093 }
@@ -2999,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2999 return ERR_PTR(-ENOMEM); 3118 return ERR_PTR(-ENOMEM);
3000 3119
3001 allocate_vpid(vmx); 3120 allocate_vpid(vmx);
3002 if (id == 0 && vm_need_ept()) {
3003 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3004 VMX_EPT_WRITABLE_MASK |
3005 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3006 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3007 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3008 VMX_EPT_EXECUTABLE_MASK);
3009 kvm_enable_tdp();
3010 }
3011 3121
3012 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 3122 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3013 if (err) 3123 if (err)
@@ -3095,7 +3205,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3095 .prepare_guest_switch = vmx_save_host_state, 3205 .prepare_guest_switch = vmx_save_host_state,
3096 .vcpu_load = vmx_vcpu_load, 3206 .vcpu_load = vmx_vcpu_load,
3097 .vcpu_put = vmx_vcpu_put, 3207 .vcpu_put = vmx_vcpu_put,
3098 .vcpu_decache = vmx_vcpu_decache,
3099 3208
3100 .set_guest_debug = set_guest_debug, 3209 .set_guest_debug = set_guest_debug,
3101 .guest_debug_pre = kvm_guest_debug_pre, 3210 .guest_debug_pre = kvm_guest_debug_pre,
@@ -3187,8 +3296,16 @@ static int __init vmx_init(void)
3187 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3296 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3188 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3297 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3189 3298
3190 if (cpu_has_vmx_ept()) 3299 if (vm_need_ept()) {
3191 bypass_guest_pf = 0; 3300 bypass_guest_pf = 0;
3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3302 VMX_EPT_WRITABLE_MASK |
3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3304 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3305 VMX_EPT_EXECUTABLE_MASK);
3306 kvm_enable_tdp();
3307 } else
3308 kvm_disable_tdp();
3192 3309
3193 if (bypass_guest_pf) 3310 if (bypass_guest_pf)
3194 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 3311 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
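
vmx_init() now programs the MMU's base EPT bits once, at module load: every EPT leaf gets read/write permission plus a memory type in bits 5:3, and the old fake accessed/dirty bits are gone. The sketch below shows roughly how such a leaf entry is composed; the memory-type encoding (6 = write-back, stored at bits 5:3) follows the SDM's EPT format and is an assumption here, not something spelled out in this patch:

#include <stdint.h>
#include <stdio.h>

#define VMX_EPT_READABLE_MASK   0x1ull
#define VMX_EPT_WRITABLE_MASK   0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
#define VMX_EPT_MT_EPTE_SHIFT   3       /* assumed: memory type lives in bits 5:3 */
#define VMX_EPT_DEFAULT_MT      6ull    /* assumed: write-back */

/* Hypothetical helper: compose a leaf EPT entry for a host page frame. */
static uint64_t make_ept_pte(uint64_t pfn)
{
        uint64_t base = VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
                        (VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);

        return (pfn << 12) | base | VMX_EPT_EXECUTABLE_MASK;
}

int main(void)
{
        printf("ept pte for pfn 0x1234: %#llx\n",
               (unsigned long long)make_ept_pte(0x1234));
        return 0;
}
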
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610dfe..17e25995b65b 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
41#define CPU_BASED_CR8_STORE_EXITING 0x00100000 41#define CPU_BASED_CR8_STORE_EXITING 0x00100000
42#define CPU_BASED_TPR_SHADOW 0x00200000 42#define CPU_BASED_TPR_SHADOW 0x00200000
43#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
43#define CPU_BASED_MOV_DR_EXITING 0x00800000 44#define CPU_BASED_MOV_DR_EXITING 0x00800000
44#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 45#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
45#define CPU_BASED_USE_IO_BITMAPS 0x02000000 46#define CPU_BASED_USE_IO_BITMAPS 0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
216#define EXIT_REASON_TRIPLE_FAULT 2 217#define EXIT_REASON_TRIPLE_FAULT 2
217 218
218#define EXIT_REASON_PENDING_INTERRUPT 7 219#define EXIT_REASON_PENDING_INTERRUPT 7
219 220#define EXIT_REASON_NMI_WINDOW 8
220#define EXIT_REASON_TASK_SWITCH 9 221#define EXIT_REASON_TASK_SWITCH 9
221#define EXIT_REASON_CPUID 10 222#define EXIT_REASON_CPUID 10
222#define EXIT_REASON_HLT 12 223#define EXIT_REASON_HLT 12
@@ -251,7 +252,9 @@ enum vmcs_field {
251#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 252#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
252#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 253#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
253#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ 254#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
255#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
254#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 256#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
257#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
255 258
256#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 259#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
257#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 260#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
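
The new INTR_INFO_UNBLOCK_NMI and INTR_INFO_RESVD_BITS_MASK definitions extend the existing interruption-information layout: vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, "NMI unblocking due to IRET" in bit 12, valid in bit 31. A small standalone decoder over that layout (illustrative only):

#include <stdint.h>
#include <stdio.h>

#define INTR_INFO_VECTOR_MASK       0xff
#define INTR_INFO_INTR_TYPE_MASK    0x700
#define INTR_INFO_DELIVER_CODE_MASK 0x800
#define INTR_INFO_UNBLOCK_NMI       0x1000
#define INTR_INFO_VALID_MASK        0x80000000

static void dump_intr_info(uint32_t info)
{
        if (!(info & INTR_INFO_VALID_MASK)) {
                printf("no event\n");
                return;
        }
        printf("vector %u, type %u, error code %s, iret-unblocked-nmi %s\n",
               (unsigned)(info & INTR_INFO_VECTOR_MASK),
               (unsigned)((info & INTR_INFO_INTR_TYPE_MASK) >> 8),
               (info & INTR_INFO_DELIVER_CODE_MASK) ? "yes" : "no",
               (info & INTR_INFO_UNBLOCK_NMI) ? "yes" : "no");
}

int main(void)
{
        dump_intr_info(0x80000b0e);     /* page fault (vector 14) with error code */
        return 0;
}
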
@@ -259,9 +262,16 @@ enum vmcs_field {
259#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 262#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
260 263
261#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 264#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
265#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
262#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 266#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
263#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 267#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
264 268
269/* GUEST_INTERRUPTIBILITY_INFO flags. */
270#define GUEST_INTR_STATE_STI 0x00000001
271#define GUEST_INTR_STATE_MOV_SS 0x00000002
272#define GUEST_INTR_STATE_SMI 0x00000004
273#define GUEST_INTR_STATE_NMI 0x00000008
274
265/* 275/*
266 * Exit Qualifications for MOV for Control Register Access 276 * Exit Qualifications for MOV for Control Register Access
267 */ 277 */
@@ -321,21 +331,6 @@ enum vmcs_field {
321 331
322#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
323 333
324#define MSR_IA32_VMX_BASIC 0x480
325#define MSR_IA32_VMX_PINBASED_CTLS 0x481
326#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
327#define MSR_IA32_VMX_EXIT_CTLS 0x483
328#define MSR_IA32_VMX_ENTRY_CTLS 0x484
329#define MSR_IA32_VMX_MISC 0x485
330#define MSR_IA32_VMX_CR0_FIXED0 0x486
331#define MSR_IA32_VMX_CR0_FIXED1 0x487
332#define MSR_IA32_VMX_CR4_FIXED0 0x488
333#define MSR_IA32_VMX_CR4_FIXED1 0x489
334#define MSR_IA32_VMX_VMCS_ENUM 0x48a
335#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
336#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
337
338#define MSR_IA32_FEATURE_CONTROL 0x3a
339#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
340#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
341 336
@@ -360,8 +355,6 @@ enum vmcs_field {
360#define VMX_EPT_READABLE_MASK 0x1ull 355#define VMX_EPT_READABLE_MASK 0x1ull
361#define VMX_EPT_WRITABLE_MASK 0x2ull 356#define VMX_EPT_WRITABLE_MASK 0x2ull
362#define VMX_EPT_EXECUTABLE_MASK 0x4ull 357#define VMX_EPT_EXECUTABLE_MASK 0x4ull
363#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
364#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
365 358
366#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 359#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
367 360
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63a77caa59f1..0d682fc6aeb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
72 { "mmio_exits", VCPU_STAT(mmio_exits) }, 72 { "mmio_exits", VCPU_STAT(mmio_exits) },
73 { "signal_exits", VCPU_STAT(signal_exits) }, 73 { "signal_exits", VCPU_STAT(signal_exits) },
74 { "irq_window", VCPU_STAT(irq_window_exits) }, 74 { "irq_window", VCPU_STAT(irq_window_exits) },
75 { "nmi_window", VCPU_STAT(nmi_window_exits) },
75 { "halt_exits", VCPU_STAT(halt_exits) }, 76 { "halt_exits", VCPU_STAT(halt_exits) },
76 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 77 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) }, 78 { "hypercalls", VCPU_STAT(hypercalls) },
@@ -173,6 +174,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 174 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174} 175}
175 176
177void kvm_inject_nmi(struct kvm_vcpu *vcpu)
178{
179 vcpu->arch.nmi_pending = 1;
180}
181EXPORT_SYMBOL_GPL(kvm_inject_nmi);
182
176void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 183void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177{ 184{
178 WARN_ON(vcpu->arch.exception.pending); 185 WARN_ON(vcpu->arch.exception.pending);
@@ -604,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
604 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 611 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
605} 612}
606 613
614static bool msr_mtrr_valid(unsigned msr)
615{
616 switch (msr) {
617 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
618 case MSR_MTRRfix64K_00000:
619 case MSR_MTRRfix16K_80000:
620 case MSR_MTRRfix16K_A0000:
621 case MSR_MTRRfix4K_C0000:
622 case MSR_MTRRfix4K_C8000:
623 case MSR_MTRRfix4K_D0000:
624 case MSR_MTRRfix4K_D8000:
625 case MSR_MTRRfix4K_E0000:
626 case MSR_MTRRfix4K_E8000:
627 case MSR_MTRRfix4K_F0000:
628 case MSR_MTRRfix4K_F8000:
629 case MSR_MTRRdefType:
630 case MSR_IA32_CR_PAT:
631 return true;
632 case 0x2f8:
633 return true;
634 }
635 return false;
636}
637
638static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
639{
640 if (!msr_mtrr_valid(msr))
641 return 1;
642
643 vcpu->arch.mtrr[msr - 0x200] = data;
644 return 0;
645}
607 646
608int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 647int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
609{ 648{
@@ -625,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
625 break; 664 break;
626 case MSR_IA32_UCODE_REV: 665 case MSR_IA32_UCODE_REV:
627 case MSR_IA32_UCODE_WRITE: 666 case MSR_IA32_UCODE_WRITE:
628 case 0x200 ... 0x2ff: /* MTRRs */
629 break; 667 break;
668 case 0x200 ... 0x2ff:
669 return set_msr_mtrr(vcpu, msr, data);
630 case MSR_IA32_APICBASE: 670 case MSR_IA32_APICBASE:
631 kvm_set_apic_base(vcpu, data); 671 kvm_set_apic_base(vcpu, data);
632 break; 672 break;
@@ -684,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
684 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 724 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
685} 725}
686 726
727static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
728{
729 if (!msr_mtrr_valid(msr))
730 return 1;
731
732 *pdata = vcpu->arch.mtrr[msr - 0x200];
733 return 0;
734}
735
687int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 736int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
688{ 737{
689 u64 data; 738 u64 data;
@@ -705,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
705 case MSR_IA32_MC0_MISC+16: 754 case MSR_IA32_MC0_MISC+16:
706 case MSR_IA32_UCODE_REV: 755 case MSR_IA32_UCODE_REV:
707 case MSR_IA32_EBL_CR_POWERON: 756 case MSR_IA32_EBL_CR_POWERON:
708 /* MTRR registers */
709 case 0xfe:
710 case 0x200 ... 0x2ff:
711 data = 0; 757 data = 0;
712 break; 758 break;
759 case MSR_MTRRcap:
760 data = 0x500 | KVM_NR_VAR_MTRR;
761 break;
762 case 0x200 ... 0x2ff:
763 return get_msr_mtrr(vcpu, msr, pdata);
713 case 0xcd: /* fsb frequency */ 764 case 0xcd: /* fsb frequency */
714 data = 3; 765 data = 3;
715 break; 766 break;
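
The MTRRcap value reported to the guest, 0x500 | KVM_NR_VAR_MTRR, advertises fixed-range MTRRs (bit 8) and write-combining support (bit 10) and puts the variable-range count in the low byte. A quick standalone decode of that constant (KVM_NR_VAR_MTRR is assumed to be 8 here):

#include <stdio.h>

#define KVM_NR_VAR_MTRR 8       /* assumption; the real value comes from KVM headers */

int main(void)
{
        unsigned mtrrcap = 0x500 | KVM_NR_VAR_MTRR;

        printf("variable ranges: %u\n", mtrrcap & 0xff);                   /* VCNT */
        printf("fixed MTRRs:     %s\n", (mtrrcap & 0x100) ? "yes" : "no"); /* FIX  */
        printf("write combining: %s\n", (mtrrcap & 0x400) ? "yes" : "no"); /* WC   */
        return 0;
}
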
@@ -817,41 +868,6 @@ out:
817 return r; 868 return r;
818} 869}
819 870
820/*
821 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
822 * cached on it.
823 */
824void decache_vcpus_on_cpu(int cpu)
825{
826 struct kvm *vm;
827 struct kvm_vcpu *vcpu;
828 int i;
829
830 spin_lock(&kvm_lock);
831 list_for_each_entry(vm, &vm_list, vm_list)
832 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
833 vcpu = vm->vcpus[i];
834 if (!vcpu)
835 continue;
836 /*
837 * If the vcpu is locked, then it is running on some
838 * other cpu and therefore it is not cached on the
839 * cpu in question.
840 *
841 * If it's not locked, check the last cpu it executed
842 * on.
843 */
844 if (mutex_trylock(&vcpu->mutex)) {
845 if (vcpu->cpu == cpu) {
846 kvm_x86_ops->vcpu_decache(vcpu);
847 vcpu->cpu = -1;
848 }
849 mutex_unlock(&vcpu->mutex);
850 }
851 }
852 spin_unlock(&kvm_lock);
853}
854
855int kvm_dev_ioctl_check_extension(long ext) 871int kvm_dev_ioctl_check_extension(long ext)
856{ 872{
857 int r; 873 int r;
@@ -867,8 +883,12 @@ int kvm_dev_ioctl_check_extension(long ext)
867 case KVM_CAP_PIT: 883 case KVM_CAP_PIT:
868 case KVM_CAP_NOP_IO_DELAY: 884 case KVM_CAP_NOP_IO_DELAY:
869 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
886 case KVM_CAP_SYNC_MMU:
870 r = 1; 887 r = 1;
871 break; 888 break;
889 case KVM_CAP_COALESCED_MMIO:
890 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
891 break;
872 case KVM_CAP_VAPIC: 892 case KVM_CAP_VAPIC:
873 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 893 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
874 break; 894 break;
@@ -1476,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1476 goto out; 1496 goto out;
1477 1497
1478 down_write(&kvm->slots_lock); 1498 down_write(&kvm->slots_lock);
1499 spin_lock(&kvm->mmu_lock);
1479 1500
1480 p = &kvm->arch.aliases[alias->slot]; 1501 p = &kvm->arch.aliases[alias->slot];
1481 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1502 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1487,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1487 break; 1508 break;
1488 kvm->arch.naliases = n; 1509 kvm->arch.naliases = n;
1489 1510
1511 spin_unlock(&kvm->mmu_lock);
1490 kvm_mmu_zap_all(kvm); 1512 kvm_mmu_zap_all(kvm);
1491 1513
1492 up_write(&kvm->slots_lock); 1514 up_write(&kvm->slots_lock);
@@ -1781,13 +1803,14 @@ static void kvm_init_msr_list(void)
1781 * Only apic need an MMIO device hook, so shortcut now.. 1803 * Only apic need an MMIO device hook, so shortcut now..
1782 */ 1804 */
1783static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1805static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1784 gpa_t addr) 1806 gpa_t addr, int len,
1807 int is_write)
1785{ 1808{
1786 struct kvm_io_device *dev; 1809 struct kvm_io_device *dev;
1787 1810
1788 if (vcpu->arch.apic) { 1811 if (vcpu->arch.apic) {
1789 dev = &vcpu->arch.apic->dev; 1812 dev = &vcpu->arch.apic->dev;
1790 if (dev->in_range(dev, addr)) 1813 if (dev->in_range(dev, addr, len, is_write))
1791 return dev; 1814 return dev;
1792 } 1815 }
1793 return NULL; 1816 return NULL;
@@ -1795,13 +1818,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1795 1818
1796 1819
1797static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1820static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1798 gpa_t addr) 1821 gpa_t addr, int len,
1822 int is_write)
1799{ 1823{
1800 struct kvm_io_device *dev; 1824 struct kvm_io_device *dev;
1801 1825
1802 dev = vcpu_find_pervcpu_dev(vcpu, addr); 1826 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1803 if (dev == NULL) 1827 if (dev == NULL)
1804 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1828 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1829 is_write);
1805 return dev; 1830 return dev;
1806} 1831}
1807 1832
@@ -1869,7 +1894,7 @@ mmio:
1869 * Is this MMIO handled locally? 1894 * Is this MMIO handled locally?
1870 */ 1895 */
1871 mutex_lock(&vcpu->kvm->lock); 1896 mutex_lock(&vcpu->kvm->lock);
1872 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1897 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1873 if (mmio_dev) { 1898 if (mmio_dev) {
1874 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1899 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1875 mutex_unlock(&vcpu->kvm->lock); 1900 mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1949,7 @@ mmio:
1924 * Is this MMIO handled locally? 1949 * Is this MMIO handled locally?
1925 */ 1950 */
1926 mutex_lock(&vcpu->kvm->lock); 1951 mutex_lock(&vcpu->kvm->lock);
1927 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1952 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
1928 if (mmio_dev) { 1953 if (mmio_dev) {
1929 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1954 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1930 mutex_unlock(&vcpu->kvm->lock); 1955 mutex_unlock(&vcpu->kvm->lock);
@@ -2020,6 +2045,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2020 2045
2021int emulate_clts(struct kvm_vcpu *vcpu) 2046int emulate_clts(struct kvm_vcpu *vcpu)
2022{ 2047{
2048 KVMTRACE_0D(CLTS, vcpu, handler);
2023 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2049 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2024 return X86EMUL_CONTINUE; 2050 return X86EMUL_CONTINUE;
2025} 2051}
@@ -2053,21 +2079,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2053 2079
2054void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2080void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2055{ 2081{
2056 static int reported;
2057 u8 opcodes[4]; 2082 u8 opcodes[4];
2058 unsigned long rip = vcpu->arch.rip; 2083 unsigned long rip = vcpu->arch.rip;
2059 unsigned long rip_linear; 2084 unsigned long rip_linear;
2060 2085
2061 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2086 if (!printk_ratelimit())
2062
2063 if (reported)
2064 return; 2087 return;
2065 2088
2089 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2090
2066 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2091 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2067 2092
2068 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2093 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2069 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2094 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2070 reported = 1;
2071} 2095}
2072EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2096EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2073 2097
@@ -2105,27 +2129,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2105 ? X86EMUL_MODE_PROT64 : cs_db 2129 ? X86EMUL_MODE_PROT64 : cs_db
2106 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2130 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2107 2131
2108 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2109 vcpu->arch.emulate_ctxt.cs_base = 0;
2110 vcpu->arch.emulate_ctxt.ds_base = 0;
2111 vcpu->arch.emulate_ctxt.es_base = 0;
2112 vcpu->arch.emulate_ctxt.ss_base = 0;
2113 } else {
2114 vcpu->arch.emulate_ctxt.cs_base =
2115 get_segment_base(vcpu, VCPU_SREG_CS);
2116 vcpu->arch.emulate_ctxt.ds_base =
2117 get_segment_base(vcpu, VCPU_SREG_DS);
2118 vcpu->arch.emulate_ctxt.es_base =
2119 get_segment_base(vcpu, VCPU_SREG_ES);
2120 vcpu->arch.emulate_ctxt.ss_base =
2121 get_segment_base(vcpu, VCPU_SREG_SS);
2122 }
2123
2124 vcpu->arch.emulate_ctxt.gs_base =
2125 get_segment_base(vcpu, VCPU_SREG_GS);
2126 vcpu->arch.emulate_ctxt.fs_base =
2127 get_segment_base(vcpu, VCPU_SREG_FS);
2128
2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2132 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2130 2133
2131 /* Reject the instructions other than VMCALL/VMMCALL when 2134 /* Reject the instructions other than VMCALL/VMMCALL when
@@ -2300,9 +2303,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
2300} 2303}
2301 2304
2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2305static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2303 gpa_t addr) 2306 gpa_t addr, int len,
2307 int is_write)
2304{ 2308{
2305 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2309 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2306} 2310}
2307 2311
2308int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2312int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2331,11 +2335,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2331 2335
2332 kvm_x86_ops->cache_regs(vcpu); 2336 kvm_x86_ops->cache_regs(vcpu);
2333 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2337 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2334 kvm_x86_ops->decache_regs(vcpu);
2335 2338
2336 kvm_x86_ops->skip_emulated_instruction(vcpu); 2339 kvm_x86_ops->skip_emulated_instruction(vcpu);
2337 2340
2338 pio_dev = vcpu_find_pio_dev(vcpu, port); 2341 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2339 if (pio_dev) { 2342 if (pio_dev) {
2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2343 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2341 complete_pio(vcpu); 2344 complete_pio(vcpu);
@@ -2417,7 +2420,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2417 } 2420 }
2418 } 2421 }
2419 2422
2420 pio_dev = vcpu_find_pio_dev(vcpu, port); 2423 pio_dev = vcpu_find_pio_dev(vcpu, port,
2424 vcpu->arch.pio.cur_count,
2425 !vcpu->arch.pio.in);
2421 if (!vcpu->arch.pio.in) { 2426 if (!vcpu->arch.pio.in) {
2422 /* string PIO write */ 2427 /* string PIO write */
2423 ret = pio_copy_data(vcpu); 2428 ret = pio_copy_data(vcpu);
@@ -2600,27 +2605,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2600 2605
2601unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2606unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2602{ 2607{
2608 unsigned long value;
2609
2603 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2610 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2604 switch (cr) { 2611 switch (cr) {
2605 case 0: 2612 case 0:
2606 return vcpu->arch.cr0; 2613 value = vcpu->arch.cr0;
2614 break;
2607 case 2: 2615 case 2:
2608 return vcpu->arch.cr2; 2616 value = vcpu->arch.cr2;
2617 break;
2609 case 3: 2618 case 3:
2610 return vcpu->arch.cr3; 2619 value = vcpu->arch.cr3;
2620 break;
2611 case 4: 2621 case 4:
2612 return vcpu->arch.cr4; 2622 value = vcpu->arch.cr4;
2623 break;
2613 case 8: 2624 case 8:
2614 return kvm_get_cr8(vcpu); 2625 value = kvm_get_cr8(vcpu);
2626 break;
2615 default: 2627 default:
2616 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2628 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2617 return 0; 2629 return 0;
2618 } 2630 }
2631 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2632 (u32)((u64)value >> 32), handler);
2633
2634 return value;
2619} 2635}
2620 2636
2621void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2637void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2622 unsigned long *rflags) 2638 unsigned long *rflags)
2623{ 2639{
2640 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2641 (u32)((u64)val >> 32), handler);
2642
2624 switch (cr) { 2643 switch (cr) {
2625 case 0: 2644 case 0:
2626 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2645 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2771,8 +2790,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2771 if (!apic || !apic->vapic_addr) 2790 if (!apic || !apic->vapic_addr)
2772 return; 2791 return;
2773 2792
2793 down_read(&vcpu->kvm->slots_lock);
2774 kvm_release_page_dirty(apic->vapic_page); 2794 kvm_release_page_dirty(apic->vapic_page);
2775 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2795 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2796 up_read(&vcpu->kvm->slots_lock);
2776} 2797}
2777 2798
2778static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2799static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2928,9 +2949,7 @@ out:
2928 2949
2929 post_kvm_run_save(vcpu, kvm_run); 2950 post_kvm_run_save(vcpu, kvm_run);
2930 2951
2931 down_read(&vcpu->kvm->slots_lock);
2932 vapic_exit(vcpu); 2952 vapic_exit(vcpu);
2933 up_read(&vcpu->kvm->slots_lock);
2934 2953
2935 return r; 2954 return r;
2936} 2955}
@@ -2942,15 +2961,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2942 2961
2943 vcpu_load(vcpu); 2962 vcpu_load(vcpu);
2944 2963
2964 if (vcpu->sigset_active)
2965 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2966
2945 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 2967 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2946 kvm_vcpu_block(vcpu); 2968 kvm_vcpu_block(vcpu);
2947 vcpu_put(vcpu); 2969 r = -EAGAIN;
2948 return -EAGAIN; 2970 goto out;
2949 } 2971 }
2950 2972
2951 if (vcpu->sigset_active)
2952 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2953
2954 /* re-sync apic's tpr */ 2973 /* re-sync apic's tpr */
2955 if (!irqchip_in_kernel(vcpu->kvm)) 2974 if (!irqchip_in_kernel(vcpu->kvm))
2956 kvm_set_cr8(vcpu, kvm_run->cr8); 2975 kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -3070,8 +3089,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3070 return 0; 3089 return 0;
3071} 3090}
3072 3091
3073static void get_segment(struct kvm_vcpu *vcpu, 3092void kvm_get_segment(struct kvm_vcpu *vcpu,
3074 struct kvm_segment *var, int seg) 3093 struct kvm_segment *var, int seg)
3075{ 3094{
3076 kvm_x86_ops->get_segment(vcpu, var, seg); 3095 kvm_x86_ops->get_segment(vcpu, var, seg);
3077} 3096}
@@ -3080,7 +3099,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{ 3099{
3081 struct kvm_segment cs; 3100 struct kvm_segment cs;
3082 3101
3083 get_segment(vcpu, &cs, VCPU_SREG_CS); 3102 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3084 *db = cs.db; 3103 *db = cs.db;
3085 *l = cs.l; 3104 *l = cs.l;
3086} 3105}
@@ -3094,15 +3113,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3094 3113
3095 vcpu_load(vcpu); 3114 vcpu_load(vcpu);
3096 3115
3097 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3116 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3098 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3117 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3099 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3118 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3100 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3119 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3101 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3120 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3102 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3121 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3103 3122
3104 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3123 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3105 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3124 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3106 3125
3107 kvm_x86_ops->get_idt(vcpu, &dt); 3126 kvm_x86_ops->get_idt(vcpu, &dt);
3108 sregs->idt.limit = dt.limit; 3127 sregs->idt.limit = dt.limit;
@@ -3154,7 +3173,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3154 return 0; 3173 return 0;
3155} 3174}
3156 3175
3157static void set_segment(struct kvm_vcpu *vcpu, 3176static void kvm_set_segment(struct kvm_vcpu *vcpu,
3158 struct kvm_segment *var, int seg) 3177 struct kvm_segment *var, int seg)
3159{ 3178{
3160 kvm_x86_ops->set_segment(vcpu, var, seg); 3179 kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3168,6 +3187,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3168 kvm_desct->base |= seg_desc->base2 << 24; 3187 kvm_desct->base |= seg_desc->base2 << 24;
3169 kvm_desct->limit = seg_desc->limit0; 3188 kvm_desct->limit = seg_desc->limit0;
3170 kvm_desct->limit |= seg_desc->limit << 16; 3189 kvm_desct->limit |= seg_desc->limit << 16;
3190 if (seg_desc->g) {
3191 kvm_desct->limit <<= 12;
3192 kvm_desct->limit |= 0xfff;
3193 }
3171 kvm_desct->selector = selector; 3194 kvm_desct->selector = selector;
3172 kvm_desct->type = seg_desc->type; 3195 kvm_desct->type = seg_desc->type;
3173 kvm_desct->present = seg_desc->p; 3196 kvm_desct->present = seg_desc->p;
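
The added lines fix limits for descriptors that use page granularity: when the G bit is set, the 20-bit raw limit counts 4 KiB units, so the byte limit becomes (limit << 12) | 0xfff. A standalone sketch of that conversion (helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

/* Convert a 20-bit descriptor limit to a byte limit, honoring the G bit. */
static uint32_t segment_byte_limit(uint32_t raw_limit, int g_bit)
{
        uint32_t limit = raw_limit & 0xfffff;   /* descriptors hold 20 limit bits */

        if (g_bit) {
                limit <<= 12;                   /* 4 KiB granularity ...          */
                limit |= 0xfff;                 /* ... covering the whole last page */
        }
        return limit;
}

int main(void)
{
        printf("flat 4G limit: %#x\n", (unsigned)segment_byte_limit(0xfffff, 1));
        printf("64K byte-granular limit: %#x\n", (unsigned)segment_byte_limit(0xffff, 0));
        return 0;
}
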
@@ -3191,7 +3214,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3191 if (selector & 1 << 2) { 3214 if (selector & 1 << 2) {
3192 struct kvm_segment kvm_seg; 3215 struct kvm_segment kvm_seg;
3193 3216
3194 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3217 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3195 3218
3196 if (kvm_seg.unusable) 3219 if (kvm_seg.unusable)
3197 dtable->limit = 0; 3220 dtable->limit = 0;
@@ -3207,6 +3230,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3207static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3230static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3208 struct desc_struct *seg_desc) 3231 struct desc_struct *seg_desc)
3209{ 3232{
3233 gpa_t gpa;
3210 struct descriptor_table dtable; 3234 struct descriptor_table dtable;
3211 u16 index = selector >> 3; 3235 u16 index = selector >> 3;
3212 3236
@@ -3216,13 +3240,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3216 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3240 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3217 return 1; 3241 return 1;
3218 } 3242 }
3219 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3243 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3244 gpa += index * 8;
3245 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3220} 3246}
3221 3247
3222/* allowed just for 8-byte segments */ 3248/* allowed just for 8-byte segments */
3223static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3249static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3224 struct desc_struct *seg_desc) 3250 struct desc_struct *seg_desc)
3225{ 3251{
3252 gpa_t gpa;
3226 struct descriptor_table dtable; 3253 struct descriptor_table dtable;
3227 u16 index = selector >> 3; 3254 u16 index = selector >> 3;
3228 3255
@@ -3230,7 +3257,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3230 3257
3231 if (dtable.limit < index * 8 + 7) 3258 if (dtable.limit < index * 8 + 7)
3232 return 1; 3259 return 1;
3233 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3260 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3261 gpa += index * 8;
3262 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3234} 3263}
3235 3264
3236static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3265static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
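
Descriptor reads and writes now go through the guest page tables: the selector's index field (selector >> 3) selects an 8-byte slot, the GDT/LDT base is a guest virtual address, and gva_to_gpa() produces the physical address passed to kvm_read_guest()/kvm_write_guest(). A simplified standalone model of the address computation; the identity gva-to-gpa mapping is purely for the demo:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/* Stand-in for vcpu->arch.mmu.gva_to_gpa(): identity map for the demo. */
static gpa_t demo_gva_to_gpa(gva_t gva)
{
        return (gpa_t)gva;
}

/* Physical address of the 8-byte descriptor named by a selector. */
static gpa_t descriptor_gpa(gva_t table_base, uint32_t table_limit, uint16_t selector)
{
        uint16_t index = selector >> 3;         /* bits 15:3 index the table */

        if (table_limit < index * 8u + 7u)
                return (gpa_t)-1;               /* out of bounds: caller injects #GP */

        return demo_gva_to_gpa(table_base) + index * 8u;
}

int main(void)
{
        printf("descriptor gpa: %#llx\n",
               (unsigned long long)descriptor_gpa(0x1000, 0xffff, 0x10 /* index 2 */));
        return 0;
}
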
@@ -3242,62 +3271,14 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3242 base_addr |= (seg_desc->base1 << 16); 3271 base_addr |= (seg_desc->base1 << 16);
3243 base_addr |= (seg_desc->base2 << 24); 3272 base_addr |= (seg_desc->base2 << 24);
3244 3273
3245 return base_addr; 3274 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3246}
3247
3248static int load_tss_segment32(struct kvm_vcpu *vcpu,
3249 struct desc_struct *seg_desc,
3250 struct tss_segment_32 *tss)
3251{
3252 u32 base_addr;
3253
3254 base_addr = get_tss_base_addr(vcpu, seg_desc);
3255
3256 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3257 sizeof(struct tss_segment_32));
3258}
3259
3260static int save_tss_segment32(struct kvm_vcpu *vcpu,
3261 struct desc_struct *seg_desc,
3262 struct tss_segment_32 *tss)
3263{
3264 u32 base_addr;
3265
3266 base_addr = get_tss_base_addr(vcpu, seg_desc);
3267
3268 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3269 sizeof(struct tss_segment_32));
3270}
3271
3272static int load_tss_segment16(struct kvm_vcpu *vcpu,
3273 struct desc_struct *seg_desc,
3274 struct tss_segment_16 *tss)
3275{
3276 u32 base_addr;
3277
3278 base_addr = get_tss_base_addr(vcpu, seg_desc);
3279
3280 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3281 sizeof(struct tss_segment_16));
3282}
3283
3284static int save_tss_segment16(struct kvm_vcpu *vcpu,
3285 struct desc_struct *seg_desc,
3286 struct tss_segment_16 *tss)
3287{
3288 u32 base_addr;
3289
3290 base_addr = get_tss_base_addr(vcpu, seg_desc);
3291
3292 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3293 sizeof(struct tss_segment_16));
3294} 3275}
3295 3276
3296static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3277static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3297{ 3278{
3298 struct kvm_segment kvm_seg; 3279 struct kvm_segment kvm_seg;
3299 3280
3300 get_segment(vcpu, &kvm_seg, seg); 3281 kvm_get_segment(vcpu, &kvm_seg, seg);
3301 return kvm_seg.selector; 3282 return kvm_seg.selector;
3302} 3283}
3303 3284
@@ -3313,8 +3294,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3313 return 0; 3294 return 0;
3314} 3295}
3315 3296
3316static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3297int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3317 int type_bits, int seg) 3298 int type_bits, int seg)
3318{ 3299{
3319 struct kvm_segment kvm_seg; 3300 struct kvm_segment kvm_seg;
3320 3301
@@ -3327,7 +3308,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3327 if (!kvm_seg.s) 3308 if (!kvm_seg.s)
3328 kvm_seg.unusable = 1; 3309 kvm_seg.unusable = 1;
3329 3310
3330 set_segment(vcpu, &kvm_seg, seg); 3311 kvm_set_segment(vcpu, &kvm_seg, seg);
3331 return 0; 3312 return 0;
3332} 3313}
3333 3314
@@ -3373,25 +3354,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3375 3356
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3358 return 1;
3378 3359
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3360 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3361 return 1;
3381 3362
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3363 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3364 return 1;
3384 3365
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3366 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3367 return 1;
3387 3368
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3369 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3370 return 1;
3390 3371
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3372 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3373 return 1;
3393 3374
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3375 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3376 return 1;
3396 return 0; 3377 return 0;
3397} 3378}
@@ -3432,38 +3413,44 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3434 3415
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3417 return 1;
3437 3418
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3419 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3420 return 1;
3440 3421
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3422 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3423 return 1;
3443 3424
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3425 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3426 return 1;
3446 3427
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3428 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3429 return 1;
3449 return 0; 3430 return 0;
3450} 3431}
3451 3432
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3433static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3434 u32 old_tss_base,
3454 struct desc_struct *nseg_desc) 3435 struct desc_struct *nseg_desc)
3455{ 3436{
3456 struct tss_segment_16 tss_segment_16; 3437 struct tss_segment_16 tss_segment_16;
3457 int ret = 0; 3438 int ret = 0;
3458 3439
3459 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3440 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3441 sizeof tss_segment_16))
3460 goto out; 3442 goto out;
3461 3443
3462 save_state_to_tss16(vcpu, &tss_segment_16); 3444 save_state_to_tss16(vcpu, &tss_segment_16);
3463 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3464 3445
3465 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3446 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3447 sizeof tss_segment_16))
3448 goto out;
3449
3450 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3451 &tss_segment_16, sizeof tss_segment_16))
3466 goto out; 3452 goto out;
3453
3467 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3454 if (load_state_from_tss16(vcpu, &tss_segment_16))
3468 goto out; 3455 goto out;
3469 3456
@@ -3472,21 +3459,27 @@ out:
3472 return ret; 3459 return ret;
3473} 3460}
3474 3461
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3462static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3463 u32 old_tss_base,
3477 struct desc_struct *nseg_desc) 3464 struct desc_struct *nseg_desc)
3478{ 3465{
3479 struct tss_segment_32 tss_segment_32; 3466 struct tss_segment_32 tss_segment_32;
3480 int ret = 0; 3467 int ret = 0;
3481 3468
3482 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3469 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3470 sizeof tss_segment_32))
3483 goto out; 3471 goto out;
3484 3472
3485 save_state_to_tss32(vcpu, &tss_segment_32); 3473 save_state_to_tss32(vcpu, &tss_segment_32);
3486 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3487 3474
3488 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3475 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3476 sizeof tss_segment_32))
3477 goto out;
3478
3479 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3480 &tss_segment_32, sizeof tss_segment_32))
3489 goto out; 3481 goto out;
3482
3490 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3483 if (load_state_from_tss32(vcpu, &tss_segment_32))
3491 goto out; 3484 goto out;
3492 3485
@@ -3501,16 +3494,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3501 struct desc_struct cseg_desc; 3494 struct desc_struct cseg_desc;
3502 struct desc_struct nseg_desc; 3495 struct desc_struct nseg_desc;
3503 int ret = 0; 3496 int ret = 0;
3497 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3498 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3504 3499
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3500 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3506 3501
3502 /* FIXME: Handle errors. Failure to read either TSS or their
3503 * descriptors should generate a pagefault.
3504 */
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3505 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3506 goto out;
3509 3507
3510 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3508 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3511 goto out; 3509 goto out;
3512 3510
3513
3514 if (reason != TASK_SWITCH_IRET) { 3511 if (reason != TASK_SWITCH_IRET) {
3515 int cpl; 3512 int cpl;
3516 3513
@@ -3528,8 +3525,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3528 3525
3529 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3526 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3530 cseg_desc.type &= ~(1 << 1); //clear the B flag 3527 cseg_desc.type &= ~(1 << 1); //clear the B flag
3531 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3528 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3532 &cseg_desc);
3533 } 3529 }
3534 3530
3535 if (reason == TASK_SWITCH_IRET) { 3531 if (reason == TASK_SWITCH_IRET) {
@@ -3541,10 +3537,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3541 kvm_x86_ops->cache_regs(vcpu); 3537 kvm_x86_ops->cache_regs(vcpu);
3542 3538
3543 if (nseg_desc.type & 8) 3539 if (nseg_desc.type & 8)
3544 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3545 &nseg_desc); 3541 &nseg_desc);
3546 else 3542 else
3547 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3543 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3548 &nseg_desc); 3544 &nseg_desc);
3549 3545
3550 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3546 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3561,7 +3557,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3557 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3558 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3559 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3561out:
3566 kvm_x86_ops->decache_regs(vcpu); 3562 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3563 return ret;
@@ -3628,15 +3624,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3628 } 3624 }
3629 } 3625 }
3630 3626
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3628 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3629 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3630 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3631 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3632 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3637 3633
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3640 3636
3641 vcpu_put(vcpu); 3637 vcpu_put(vcpu);
3642 3638
@@ -3751,14 +3747,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3747 * allocate ram with GFP_KERNEL.
3752 */ 3748 */
3753 if (!used_math()) 3749 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3750 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3751
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3752 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3753 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3754 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3755 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3756 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3757 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3758 preempt_enable();
3763 3759
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3760 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3771,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3771 return;
3776 3772
3777 vcpu->guest_fpu_loaded = 1; 3773 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3774 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3775 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3776}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3777EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3778
@@ -3786,8 +3782,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3782 return;
3787 3783
3788 vcpu->guest_fpu_loaded = 0; 3784 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3785 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3786 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3787 ++vcpu->stat.fpu_reload;
3792} 3788}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3789EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -3979,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3979 */ 3975 */
3980 if (!user_alloc) { 3976 if (!user_alloc) {
3981 if (npages && !old.rmap) { 3977 if (npages && !old.rmap) {
3978 unsigned long userspace_addr;
3979
3982 down_write(&current->mm->mmap_sem); 3980 down_write(&current->mm->mmap_sem);
3983 memslot->userspace_addr = do_mmap(NULL, 0, 3981 userspace_addr = do_mmap(NULL, 0,
3984 npages * PAGE_SIZE, 3982 npages * PAGE_SIZE,
3985 PROT_READ | PROT_WRITE, 3983 PROT_READ | PROT_WRITE,
3986 MAP_SHARED | MAP_ANONYMOUS, 3984 MAP_SHARED | MAP_ANONYMOUS,
3987 0); 3985 0);
3988 up_write(&current->mm->mmap_sem); 3986 up_write(&current->mm->mmap_sem);
3989 3987
3990 if (IS_ERR((void *)memslot->userspace_addr)) 3988 if (IS_ERR((void *)userspace_addr))
3991 return PTR_ERR((void *)memslot->userspace_addr); 3989 return PTR_ERR((void *)userspace_addr);
3990
3991 /* set userspace_addr atomically for kvm_hva_to_rmapp */
3992 spin_lock(&kvm->mmu_lock);
3993 memslot->userspace_addr = userspace_addr;
3994 spin_unlock(&kvm->mmu_lock);
3992 } else { 3995 } else {
3993 if (!old.user_alloc && old.rmap) { 3996 if (!old.user_alloc && old.rmap) {
3994 int ret; 3997 int ret;
@@ -4016,6 +4019,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4019 return 0;
4017} 4020}
4018 4021
4022void kvm_arch_flush_shadow(struct kvm *kvm)
4023{
4024 kvm_mmu_zap_all(kvm);
4025}
4026
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4027int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4028{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4029 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
@@ -4044,6 +4052,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4044 * So need not to call smp_call_function_single() in that case. 4052 * So need not to call smp_call_function_single() in that case.
4045 */ 4053 */
4046 if (vcpu->guest_mode && vcpu->cpu != cpu) 4054 if (vcpu->guest_mode && vcpu->cpu != cpu)
4047 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 4055 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4048 put_cpu(); 4056 put_cpu();
4049} 4057}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890c..f2f90468f8b1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -121,7 +121,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 122 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 123 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 124 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 127 /* 0x70 - 0x77 */
@@ -138,9 +138,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 141 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 142 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 143 /* 0x90 - 0x97 */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 146 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 147 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 148 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 155 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 156 /* 0xB0 - 0xBF */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xC0 - 0xC7 */ 159 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 161 0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,8 @@ static u16 opcode_table[256] = {
168 /* 0xE0 - 0xE7 */ 171 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */ 173 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 174 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps,
172 0, 0, 0, 0, 176 0, 0, 0, 0,
173 /* 0xF0 - 0xF7 */ 177 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 178 0, 0, 0, 0,
@@ -215,7 +219,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 219 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 220 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 221 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 222 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 223 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 224 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 225 DstMem | SrcReg | ModRM | BitOp,
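
Each opcode_table entry is a bitmask of orthogonal decode properties, and the hunks above simply fill in previously empty slots (push imm, mov between r/m and sreg, xchg reg, mov reg,imm, and jmp rel). A toy version of one such entry is sketched below; the flag bit values are invented for the demo and do not match the emulator's real definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative decode-flag bits; the real values live in x86_emulate.c. */
#define ByteOp (1u << 0)        /* 8-bit operation */
#define DstReg (1u << 1)        /* destination is a general register */
#define SrcImm (1u << 2)        /* source is an immediate in the instruction */
#define ModRM  (1u << 3)        /* a ModR/M byte follows the opcode */
#define Mov    (1u << 4)        /* plain move, no ALU flags updated */
#define Stack  (1u << 5)        /* implicitly uses RSP */

/* Toy table entry for opcode 0xb8 (mov reg, imm), as enabled by this hunk. */
static const uint16_t entry_b8 = DstReg | SrcImm | Mov;

int main(void)
{
        printf("needs ModR/M: %s\n", (entry_b8 & ModRM) ? "yes" : "no");
        printf("immediate source: %s\n", (entry_b8 & SrcImm) ? "yes" : "no");
        return 0;
}
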
@@ -518,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 522 register_address_increment(c, &c->eip, rel);
519} 523}
520 524
525static void set_seg_override(struct decode_cache *c, int seg)
526{
527 c->has_seg_override = true;
528 c->seg_override = seg;
529}
530
531static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
532{
533 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
534 return 0;
535
536 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
537}
538
539static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
540 struct decode_cache *c)
541{
542 if (!c->has_seg_override)
543 return 0;
544
545 return seg_base(ctxt, c->seg_override);
546}
547
548static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
549{
550 return seg_base(ctxt, VCPU_SREG_ES);
551}
552
553static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
554{
555 return seg_base(ctxt, VCPU_SREG_SS);
556}
557
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 558static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 559 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 560 unsigned long linear, u8 *dest)
@@ -660,7 +697,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 697{
661 struct decode_cache *c = &ctxt->decode; 698 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 699 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 700 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 701 int rc = 0;
665 702
666 if (c->rex_prefix) { 703 if (c->rex_prefix) {
@@ -731,47 +768,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 768 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 769 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 770 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 771 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 772 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 773 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 774 } else {
738 /* 32/64-bit ModR/M decode. */ 775 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 776 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 777 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 778 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 779 base_reg |= sib & 7;
745 scale = sib >> 6; 780 scale = sib >> 6;
746 781
747 switch (base_reg) { 782 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 783 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 784 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 785 c->modrm_ea += c->regs[base_reg];
757 } 786 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 787 c->modrm_ea += c->regs[index_reg] << scale;
763 } 788 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 789 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 790 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 791 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 792 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 793 switch (c->modrm_mod) {
776 case 0: 794 case 0:
777 if (c->modrm_rm == 5) 795 if (c->modrm_rm == 5)
@@ -785,22 +803,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 803 break;
786 } 804 }
787 } 805 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 806done:
805 return rc; 807 return rc;
806} 808}
@@ -838,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
838 840
839 memset(c, 0, sizeof(struct decode_cache)); 841 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 842 c->eip = ctxt->vcpu->arch.rip;
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 845
843 switch (mode) { 846 switch (mode) {
@@ -876,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 879 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 880 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 881 break;
882 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 883 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 884 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 885 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 886 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 887 break;
888 case 0x64: /* FS override */ 888 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 889 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 890 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 891 break;
897 case 0x40 ... 0x4f: /* REX */ 892 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 893 if (mode != X86EMUL_MODE_PROT64)
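The prefix handling above stops storing a pointer to a per-segment base and instead derives the segment index from the prefix byte itself: for the legacy overrides 0x26/0x2e/0x36/0x3e the index is (byte >> 3) & 3, for 0x64/0x65 it is byte & 7, which lines up with KVM's VCPU_SREG_ES..VCPU_SREG_GS numbering (ES=0, CS=1, SS=2, DS=3, FS=4, GS=5). A quick standalone check of that mapping (illustration only):

#include <stdio.h>

static const char *sreg_name[] = { "ES", "CS", "SS", "DS", "FS", "GS" };

int main(void)
{
	const unsigned char legacy[] = { 0x26, 0x2e, 0x36, 0x3e };
	const unsigned char fsgs[]   = { 0x64, 0x65 };
	unsigned int i;

	for (i = 0; i < sizeof(legacy); i++)	/* (b >> 3) & 3 */
		printf("prefix 0x%02x -> %s\n", legacy[i],
		       sreg_name[(legacy[i] >> 3) & 3]);
	for (i = 0; i < sizeof(fsgs); i++)	/* b & 7 */
		printf("prefix 0x%02x -> %s\n", fsgs[i],
		       sreg_name[fsgs[i] & 7]);
	return 0;
}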
@@ -964,15 +959,11 @@ done_prefixes:
964 if (rc) 959 if (rc)
965 goto done; 960 goto done;
966 961
967 if (!c->override_base) 962 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 963 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 964
974 if (c->override_base) 965 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 966 c->modrm_ea += seg_override_base(ctxt, c);
976 967
977 if (c->ad_bytes != 8) 968 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 969 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1040,7 @@ done_prefixes:
1049 break; 1040 break;
1050 case DstMem: 1041 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1042 if ((c->d & ModRM) && c->modrm_mod == 3) {
1043 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1044 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1045 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1046 c->dst.ptr = c->modrm_ptr;
@@ -1058,6 +1050,9 @@ done_prefixes:
1058 break; 1050 break;
1059 } 1051 }
1060 1052
1053 if (c->rip_relative)
1054 c->modrm_ea += c->eip;
1055
1061done: 1056done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1057 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1058}
@@ -1070,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1065 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1066 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1067 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1068 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1069 c->regs[VCPU_REGS_RSP]);
1075} 1070}
1076 1071
@@ -1080,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1075 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1076 int rc;
1082 1077
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1078 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1079 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1080 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1081 if (rc != 0)
@@ -1402,11 +1397,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1398 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1399 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1400 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1401 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1402 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1403 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1404 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1405 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1406 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1407 goto done;
@@ -1420,9 +1415,8 @@ special_insn:
1420 goto cannot_emulate; 1415 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1416 c->dst.val = (s32) c->src.val;
1422 break; 1417 break;
1418 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1419 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1420 emulate_push(ctxt);
1427 break; 1421 break;
1428 case 0x6c: /* insb */ 1422 case 0x6c: /* insb */
@@ -1433,7 +1427,7 @@ special_insn:
1433 c->rep_prefix ? 1427 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1428 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1429 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1430 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1431 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1432 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1433 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1443,8 @@ special_insn:
1449 c->rep_prefix ? 1443 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1444 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1445 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1446 register_address(c,
1453 *c->override_base : 1447 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1448 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1449 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1450 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1483,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1483 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1484 break;
1492 case 0x86 ... 0x87: /* xchg */ 1485 case 0x86 ... 0x87: /* xchg */
1486 xchg:
1493 /* Write back the register source. */ 1487 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1488 switch (c->dst.bytes) {
1495 case 1: 1489 case 1:
@@ -1514,14 +1508,60 @@ special_insn:
1514 break; 1508 break;
1515 case 0x88 ... 0x8b: /* mov */ 1509 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1510 goto mov;
1511 case 0x8c: { /* mov r/m, sreg */
1512 struct kvm_segment segreg;
1513
1514 if (c->modrm_reg <= 5)
1515 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1516 else {
1517 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1518 c->modrm);
1519 goto cannot_emulate;
1520 }
1521 c->dst.val = segreg.selector;
1522 break;
1523 }
1517 case 0x8d: /* lea r16/r32, m */ 1524 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1525 c->dst.val = c->modrm_ea;
1519 break; 1526 break;
1527 case 0x8e: { /* mov seg, r/m16 */
1528 uint16_t sel;
1529 int type_bits;
1530 int err;
1531
1532 sel = c->src.val;
1533 if (c->modrm_reg <= 5) {
1534 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1535 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1536 type_bits, c->modrm_reg);
1537 } else {
1538 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1539 c->modrm);
1540 goto cannot_emulate;
1541 }
1542
1543 if (err < 0)
1544 goto cannot_emulate;
1545
1546 c->dst.type = OP_NONE; /* Disable writeback. */
1547 break;
1548 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1549 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1550 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1551 if (rc != 0)
1523 goto done; 1552 goto done;
1524 break; 1553 break;
1554 case 0x90: /* nop / xchg r8,rax */
1555 if (!(c->rex_prefix & 1)) { /* nop */
1556 c->dst.type = OP_NONE;
1557 break;
1558 }
1559 case 0x91 ... 0x97: /* xchg reg,rax */
1560 c->src.type = c->dst.type = OP_REG;
1561 c->src.bytes = c->dst.bytes = c->op_bytes;
1562 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1563 c->src.val = *(c->src.ptr);
1564 goto xchg;
1525 case 0x9c: /* pushf */ 1565 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1566 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1567 emulate_push(ctxt);
@@ -1540,11 +1580,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1580 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1581 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1582 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1583 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1584 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1585 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1586 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1587 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1588 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1589 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1599,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1599 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1601 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1602 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1603 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1604 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1605 &c->src.val,
@@ -1572,7 +1610,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1610 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1611 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1612 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1613 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1614 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1615 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1616 &c->dst.val,
@@ -1596,7 +1634,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1634 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1637 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1639 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1640 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1646,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1646 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1647 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1648 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1649 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1650 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1651 &c->dst.val,
1615 c->dst.bytes, 1652 c->dst.bytes,
@@ -1622,6 +1659,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1659 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1660 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1661 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */
1663 goto mov;
1625 case 0xc0 ... 0xc1: 1664 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1665 emulate_grp2(ctxt);
1627 break; 1666 break;
@@ -1660,13 +1699,39 @@ special_insn:
1660 break; 1699 break;
1661 } 1700 }
1662 case 0xe9: /* jmp rel */ 1701 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1702 goto jmp;
1703 case 0xea: /* jmp far */ {
1704 uint32_t eip;
1705 uint16_t sel;
1706
1707 switch (c->op_bytes) {
1708 case 2:
1709 eip = insn_fetch(u16, 2, c->eip);
1710 break;
1711 case 4:
1712 eip = insn_fetch(u32, 4, c->eip);
1713 break;
1714 default:
1715 DPRINTF("jmp far: Invalid op_bytes\n");
1716 goto cannot_emulate;
1717 }
1718 sel = insn_fetch(u16, 2, c->eip);
1719 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1720 DPRINTF("jmp far: Failed to load CS descriptor\n");
1721 goto cannot_emulate;
1722 }
1723
1724 c->eip = eip;
1725 break;
1726 }
1727 case 0xeb:
1728 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1729 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1730 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1731 break;
1667 case 0xf4: /* hlt */ 1732 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1733 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1734 break;
1670 case 0xf5: /* cmc */ 1735 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1736 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1737 ctxt->eflags ^= EFLG_CF;
@@ -1882,6 +1947,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 1947 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 1948 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 1949 break;
1950 case 0xae: /* clflush */
1951 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 1952 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 1953 /*
1887 * Save real source value, then compare EAX against 1954 * Save real source value, then compare EAX against
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 964dfa36d367..c70e12b1a637 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -3,7 +3,7 @@ config LGUEST_GUEST
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE 5 depends on !X86_PAE
6 depends on !(X86_VISWS || X86_VOYAGER) 6 depends on !X86_VOYAGER
7 select VIRTIO 7 select VIRTIO
8 select VIRTIO_RING 8 select VIRTIO_RING
9 select VIRTIO_CONSOLE 9 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5c7e2fd52075..65f0b8a47bed 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
55#include <linux/lguest_launcher.h> 55#include <linux/lguest_launcher.h>
56#include <linux/virtio_console.h> 56#include <linux/virtio_console.h>
57#include <linux/pm.h> 57#include <linux/pm.h>
58#include <asm/apic.h>
58#include <asm/lguest.h> 59#include <asm/lguest.h>
59#include <asm/paravirt.h> 60#include <asm/paravirt.h>
60#include <asm/param.h> 61#include <asm/param.h>
@@ -607,7 +608,7 @@ static unsigned long lguest_get_wallclock(void)
607 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 608 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
608 * This matches what we want here: if we return 0 from this function, the x86 609 * This matches what we want here: if we return 0 from this function, the x86
609 * TSC clock will give up and not register itself. */ 610 * TSC clock will give up and not register itself. */
610static unsigned long lguest_cpu_khz(void) 611static unsigned long lguest_tsc_khz(void)
611{ 612{
612 return lguest_data.tsc_khz; 613 return lguest_data.tsc_khz;
613} 614}
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
783 * code qualifies for Advanced. It will also never interrupt anything. It 784 * code qualifies for Advanced. It will also never interrupt anything. It
784 * does, however, allow us to get through the Linux boot code. */ 785 * does, however, allow us to get through the Linux boot code. */
785#ifdef CONFIG_X86_LOCAL_APIC 786#ifdef CONFIG_X86_LOCAL_APIC
786static void lguest_apic_write(unsigned long reg, u32 v) 787static void lguest_apic_write(u32 reg, u32 v)
787{ 788{
788} 789}
789 790
790static u32 lguest_apic_read(unsigned long reg) 791static u32 lguest_apic_read(u32 reg)
791{ 792{
792 return 0; 793 return 0;
793} 794}
795
796static u64 lguest_apic_icr_read(void)
797{
798 return 0;
799}
800
801static void lguest_apic_icr_write(u32 low, u32 id)
802{
803 /* Warn to see if there's any stray references */
804 WARN_ON(1);
805}
806
807static void lguest_apic_wait_icr_idle(void)
808{
809 return;
810}
811
812static u32 lguest_apic_safe_wait_icr_idle(void)
813{
814 return 0;
815}
816
817static struct apic_ops lguest_basic_apic_ops = {
818 .read = lguest_apic_read,
819 .write = lguest_apic_write,
820 .icr_read = lguest_apic_icr_read,
821 .icr_write = lguest_apic_icr_write,
822 .wait_icr_idle = lguest_apic_wait_icr_idle,
823 .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
824};
794#endif 825#endif
795 826
796/* STOP! Until an interrupt comes in. */ 827/* STOP! Until an interrupt comes in. */
@@ -835,7 +866,7 @@ static __init char *lguest_memory_setup(void)
835 866
836 /* The Linux bootloader header contains an "e820" memory map: the 867 /* The Linux bootloader header contains an "e820" memory map: the
837 * Launcher populated the first entry with our memory limit. */ 868 * Launcher populated the first entry with our memory limit. */
838 add_memory_region(boot_params.e820_map[0].addr, 869 e820_add_region(boot_params.e820_map[0].addr,
839 boot_params.e820_map[0].size, 870 boot_params.e820_map[0].size,
840 boot_params.e820_map[0].type); 871 boot_params.e820_map[0].type);
841 872
@@ -990,15 +1021,13 @@ __init void lguest_init(void)
990 1021
991#ifdef CONFIG_X86_LOCAL_APIC 1022#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 1023 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 1024 apic_ops = &lguest_basic_apic_ops;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 1025#endif
997 1026
998 /* time operations */ 1027 /* time operations */
999 pv_time_ops.get_wallclock = lguest_get_wallclock; 1028 pv_time_ops.get_wallclock = lguest_get_wallclock;
1000 pv_time_ops.time_init = lguest_time_init; 1029 pv_time_ops.time_init = lguest_time_init;
1001 pv_time_ops.get_cpu_khz = lguest_cpu_khz; 1030 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1002 1031
1003 /* Now is a good time to look at the implementations of these functions 1032 /* Now is a good time to look at the implementations of these functions
1004 * before returning to the rest of lguest_init(). */ 1033 * before returning to the rest of lguest_init(). */
@@ -1012,8 +1041,12 @@ __init void lguest_init(void)
1012 * clobbered. The Launcher places our initial pagetables somewhere at 1041 * clobbered. The Launcher places our initial pagetables somewhere at
1013 * the top of our physical memory, so we don't need extra space: set 1042 * the top of our physical memory, so we don't need extra space: set
1014 * init_pg_tables_end to the end of the kernel. */ 1043 * init_pg_tables_end to the end of the kernel. */
1044 init_pg_tables_start = __pa(pg0);
1015 init_pg_tables_end = __pa(pg0); 1045 init_pg_tables_end = __pa(pg0);
1016 1046
1047 /* As described in head_32.S, we map the first 128M of memory. */
1048 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1049
1017 /* Load the %fs segment register (the per-cpu segment register) with 1050 /* Load the %fs segment register (the per-cpu segment register) with
1018 * the normal data segment to get through booting. */ 1051 * the normal data segment to get through booting. */
1019 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1052 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
@@ -1065,9 +1098,9 @@ __init void lguest_init(void)
1065 pm_power_off = lguest_power_off; 1098 pm_power_off = lguest_power_off;
1066 machine_ops.restart = lguest_restart; 1099 machine_ops.restart = lguest_restart;
1067 1100
1068 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1101 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed
1069 * to boot as normal. It never returns. */ 1102 * to boot as normal. It never returns. */
1070 start_kernel(); 1103 i386_start_kernel();
1071} 1104}
1072/* 1105/*
1073 * This marks the end of stage II of our journey, The Guest. 1106 * This marks the end of stage II of our journey, The Guest.
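The lguest hunk above drops the per-field pv_apic_ops assignments and installs one table of stub operations through the apic_ops pointer. A simplified model of that indirection, assuming wrappers in the style of apic_read()/apic_write() that dispatch through the global pointer (the wrapper shape and struct layout here are assumptions, not the kernel headers):

#include <stdio.h>
#include <stdint.h>

struct apic_ops {
	uint32_t (*read)(uint32_t reg);
	void (*write)(uint32_t reg, uint32_t v);
};

static uint32_t stub_read(uint32_t reg) { (void)reg; return 0; }
static void stub_write(uint32_t reg, uint32_t v) { (void)reg; (void)v; }

/* The guest installs a whole table of no-ops instead of patching hooks. */
static struct apic_ops stub_apic_ops = {
	.read	= stub_read,
	.write	= stub_write,
};
static struct apic_ops *apic_ops = &stub_apic_ops;

static uint32_t apic_read(uint32_t reg) { return apic_ops->read(reg); }

int main(void)
{
	/* 0x20 is the local APIC ID register offset; the stubs return 0. */
	printf("APIC ID register reads %u\n", apic_read(0x20));
	return 0;
}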
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..55e11aa6d66c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -4,8 +4,9 @@
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr-on-cpu.o
6 6
7lib-y := delay_$(BITS).o 7lib-y := delay.o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o 8lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o
9lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
10 11
11ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
@@ -16,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
16 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
17else 18else
18 obj-y += io_64.o iomap_copy_64.o 19 obj-y += io_64.o iomap_copy_64.o
19
20 CFLAGS_csum-partial_64.o := -funroll-loops
21
22 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 20 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
23 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 21 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
24 lib-y += memmove_64.o memset_64.o 22 lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ee1c3f635157..f118c110af32 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
5 */ 7 */
6 8
7#include <linux/linkage.h> 9#include <linux/linkage.h>
8#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
20 .long \orig-1f /* by default jump to orig */ 22 .long \orig-1f /* by default jump to orig */
211: 231:
22 .section .altinstr_replacement,"ax" 24 .section .altinstr_replacement,"ax"
232: .byte 0xe9 /* near jump with 32bit immediate */ 252: .byte 0xe9 /* near jump with 32bit immediate */
24 .long \alt-1b /* offset */ /* or alternatively to alt */ 26 .long \alt-1b /* offset */ /* or alternatively to alt */
25 .previous 27 .previous
26 .section .altinstructions,"a" 28 .section .altinstructions,"a"
27 .align 8 29 .align 8
28 .quad 0b 30 .quad 0b
29 .quad 2b 31 .quad 2b
30 .byte \feature /* when feature is set */ 32 .byte \feature /* when feature is set */
31 .byte 5 33 .byte 5
32 .byte 5 34 .byte 5
33 .previous 35 .previous
34 .endm 36 .endm
35 37
36/* Standard copy_to_user with segment limit checking */ 38 .macro ALIGN_DESTINATION
39#ifdef FIX_ALIGNMENT
40 /* check for bad alignment of destination */
41 movl %edi,%ecx
42 andl $7,%ecx
43 jz 102f /* already aligned */
44 subl $8,%ecx
45 negl %ecx
46 subl %ecx,%edx
47100: movb (%rsi),%al
48101: movb %al,(%rdi)
49 incq %rsi
50 incq %rdi
51 decl %ecx
52 jnz 100b
53102:
54 .section .fixup,"ax"
55103: addl %ecx,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail
57 .previous
58
59 .section __ex_table,"a"
60 .align 8
61 .quad 100b,103b
62 .quad 101b,103b
63 .previous
64#endif
65 .endm
66
67/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user) 68ENTRY(copy_to_user)
38 CFI_STARTPROC 69 CFI_STARTPROC
39 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx 71 movq %rdi,%rcx
41 addq %rdx,%rcx 72 addq %rdx,%rcx
42 jc bad_to_user 73 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx 74 cmpq TI_addr_limit(%rax),%rcx
44 jae bad_to_user 75 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC 77 CFI_ENDPROC
48 78
49ENTRY(copy_user_generic) 79/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user)
50 CFI_STARTPROC 81 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */ 82 GET_THREAD_INFO(%rax)
83 movq %rsi,%rcx
84 addq %rdx,%rcx
85 jc bad_from_user
86 cmpq TI_addr_limit(%rax),%rcx
87 jae bad_from_user
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 88 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC 89 CFI_ENDPROC
90ENDPROC(copy_from_user)
54 91
55ENTRY(__copy_from_user_inatomic) 92ENTRY(copy_user_generic)
56 CFI_STARTPROC 93 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 94 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC 95 CFI_ENDPROC
96ENDPROC(copy_user_generic)
60 97
61/* Standard copy_from_user with segment limit checking */ 98ENTRY(__copy_from_user_inatomic)
62ENTRY(copy_from_user)
63 CFI_STARTPROC 99 CFI_STARTPROC
64 GET_THREAD_INFO(%rax)
65 movq %rsi,%rcx
66 addq %rdx,%rcx
67 jc bad_from_user
68 cmpq threadinfo_addr_limit(%rax),%rcx
69 jae bad_from_user
70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 100 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC 101 CFI_ENDPROC
73ENDPROC(copy_from_user) 102ENDPROC(__copy_from_user_inatomic)
74 103
75 .section .fixup,"ax" 104 .section .fixup,"ax"
76 /* must zero dest */ 105 /* must zero dest */
106ENTRY(bad_from_user)
77bad_from_user: 107bad_from_user:
78 CFI_STARTPROC 108 CFI_STARTPROC
79 movl %edx,%ecx 109 movl %edx,%ecx
@@ -81,271 +111,158 @@ bad_from_user:
81 rep 111 rep
82 stosb 112 stosb
83bad_to_user: 113bad_to_user:
84 movl %edx,%eax 114 movl %edx,%eax
85 ret 115 ret
86 CFI_ENDPROC 116 CFI_ENDPROC
87END(bad_from_user) 117ENDPROC(bad_from_user)
88 .previous 118 .previous
89 119
90
91/* 120/*
92 * copy_user_generic_unrolled - memory copy with exception handling. 121 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq 122 * This version is for CPUs like P4 that don't have efficient micro
94 * 123 * code for rep movsq
95 * Input: 124 *
125 * Input:
96 * rdi destination 126 * rdi destination
97 * rsi source 127 * rsi source
98 * rdx count 128 * rdx count
99 * ecx zero flag -- if true zero destination on error
100 * 129 *
101 * Output: 130 * Output:
102 * eax uncopied bytes or 0 if successful. 131 * eax uncopied bytes or 0 if successfull.
103 */ 132 */
104ENTRY(copy_user_generic_unrolled) 133ENTRY(copy_user_generic_unrolled)
105 CFI_STARTPROC 134 CFI_STARTPROC
106 pushq %rbx 135 cmpl $8,%edx
107 CFI_ADJUST_CFA_OFFSET 8 136 jb 20f /* less then 8 bytes, go to byte copy loop */
108 CFI_REL_OFFSET rbx, 0 137 ALIGN_DESTINATION
109 pushq %rcx 138 movl %edx,%ecx
110 CFI_ADJUST_CFA_OFFSET 8 139 andl $63,%edx
111 CFI_REL_OFFSET rcx, 0 140 shrl $6,%ecx
112 xorl %eax,%eax /*zero for the exception handler */ 141 jz 17f
113 1421: movq (%rsi),%r8
114#ifdef FIX_ALIGNMENT 1432: movq 1*8(%rsi),%r9
115 /* check for bad alignment of destination */ 1443: movq 2*8(%rsi),%r10
116 movl %edi,%ecx 1454: movq 3*8(%rsi),%r11
117 andl $7,%ecx 1465: movq %r8,(%rdi)
118 jnz .Lbad_alignment 1476: movq %r9,1*8(%rdi)
119.Lafter_bad_alignment: 1487: movq %r10,2*8(%rdi)
120#endif 1498: movq %r11,3*8(%rdi)
121 1509: movq 4*8(%rsi),%r8
122 movq %rdx,%rcx 15110: movq 5*8(%rsi),%r9
123 15211: movq 6*8(%rsi),%r10
124 movl $64,%ebx 15312: movq 7*8(%rsi),%r11
125 shrq $6,%rdx 15413: movq %r8,4*8(%rdi)
126 decq %rdx 15514: movq %r9,5*8(%rdi)
127 js .Lhandle_tail 15615: movq %r10,6*8(%rdi)
128 15716: movq %r11,7*8(%rdi)
129 .p2align 4
130.Lloop:
131.Ls1: movq (%rsi),%r11
132.Ls2: movq 1*8(%rsi),%r8
133.Ls3: movq 2*8(%rsi),%r9
134.Ls4: movq 3*8(%rsi),%r10
135.Ld1: movq %r11,(%rdi)
136.Ld2: movq %r8,1*8(%rdi)
137.Ld3: movq %r9,2*8(%rdi)
138.Ld4: movq %r10,3*8(%rdi)
139
140.Ls5: movq 4*8(%rsi),%r11
141.Ls6: movq 5*8(%rsi),%r8
142.Ls7: movq 6*8(%rsi),%r9
143.Ls8: movq 7*8(%rsi),%r10
144.Ld5: movq %r11,4*8(%rdi)
145.Ld6: movq %r8,5*8(%rdi)
146.Ld7: movq %r9,6*8(%rdi)
147.Ld8: movq %r10,7*8(%rdi)
148
149 decq %rdx
150
151 leaq 64(%rsi),%rsi 158 leaq 64(%rsi),%rsi
152 leaq 64(%rdi),%rdi 159 leaq 64(%rdi),%rdi
153
154 jns .Lloop
155
156 .p2align 4
157.Lhandle_tail:
158 movl %ecx,%edx
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lhandle_7
162 movl $8,%ebx
163 .p2align 4
164.Lloop_8:
165.Ls9: movq (%rsi),%r8
166.Ld9: movq %r8,(%rdi)
167 decl %ecx 160 decl %ecx
168 leaq 8(%rdi),%rdi 161 jnz 1b
16217: movl %edx,%ecx
163 andl $7,%edx
164 shrl $3,%ecx
165 jz 20f
16618: movq (%rsi),%r8
16719: movq %r8,(%rdi)
169 leaq 8(%rsi),%rsi 168 leaq 8(%rsi),%rsi
170 jnz .Lloop_8 169 leaq 8(%rdi),%rdi
171 170 decl %ecx
172.Lhandle_7: 171 jnz 18b
17220: andl %edx,%edx
173 jz 23f
173 movl %edx,%ecx 174 movl %edx,%ecx
174 andl $7,%ecx 17521: movb (%rsi),%al
175 jz .Lende 17622: movb %al,(%rdi)
176 .p2align 4
177.Lloop_1:
178.Ls10: movb (%rsi),%bl
179.Ld10: movb %bl,(%rdi)
180 incq %rdi
181 incq %rsi 177 incq %rsi
178 incq %rdi
182 decl %ecx 179 decl %ecx
183 jnz .Lloop_1 180 jnz 21b
184 18123: xor %eax,%eax
185 CFI_REMEMBER_STATE
186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
193 ret 182 ret
194 CFI_RESTORE_STATE
195 183
196#ifdef FIX_ALIGNMENT 184 .section .fixup,"ax"
197 /* align destination */ 18530: shll $6,%ecx
198 .p2align 4 186 addl %ecx,%edx
199.Lbad_alignment: 187 jmp 60f
200 movl $8,%r9d 18840: lea (%rdx,%rcx,8),%rdx
201 subl %ecx,%r9d 189 jmp 60f
202 movl %r9d,%ecx 19050: movl %ecx,%edx
203 cmpq %r9,%rdx 19160: jmp copy_user_handle_tail /* ecx is zerorest also */
204 jz .Lhandle_7 192 .previous
205 js .Lhandle_7
206.Lalign_1:
207.Ls11: movb (%rsi),%bl
208.Ld11: movb %bl,(%rdi)
209 incq %rsi
210 incq %rdi
211 decl %ecx
212 jnz .Lalign_1
213 subq %r9,%rdx
214 jmp .Lafter_bad_alignment
215#endif
216 193
217 /* table sorted by exception address */
218 .section __ex_table,"a" 194 .section __ex_table,"a"
219 .align 8 195 .align 8
220 .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ 196 .quad 1b,30b
221 .quad .Ls2,.Ls1e 197 .quad 2b,30b
222 .quad .Ls3,.Ls1e 198 .quad 3b,30b
223 .quad .Ls4,.Ls1e 199 .quad 4b,30b
224 .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ 200 .quad 5b,30b
225 .quad .Ld2,.Ls2e 201 .quad 6b,30b
226 .quad .Ld3,.Ls3e 202 .quad 7b,30b
227 .quad .Ld4,.Ls4e 203 .quad 8b,30b
228 .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ 204 .quad 9b,30b
229 .quad .Ls6,.Ls5e 205 .quad 10b,30b
230 .quad .Ls7,.Ls5e 206 .quad 11b,30b
231 .quad .Ls8,.Ls5e 207 .quad 12b,30b
232 .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ 208 .quad 13b,30b
233 .quad .Ld6,.Ls6e 209 .quad 14b,30b
234 .quad .Ld7,.Ls7e 210 .quad 15b,30b
235 .quad .Ld8,.Ls8e 211 .quad 16b,30b
236 .quad .Ls9,.Le_quad 212 .quad 18b,40b
237 .quad .Ld9,.Le_quad 213 .quad 19b,40b
238 .quad .Ls10,.Le_byte 214 .quad 21b,50b
239 .quad .Ld10,.Le_byte 215 .quad 22b,50b
240#ifdef FIX_ALIGNMENT
241 .quad .Ls11,.Lzero_rest
242 .quad .Ld11,.Lzero_rest
243#endif
244 .quad .Le5,.Le_zero
245 .previous 216 .previous
246
247 /* eax: zero, ebx: 64 */
248.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
249.Ls2e: addl $8,%eax
250.Ls3e: addl $8,%eax
251.Ls4e: addl $8,%eax
252.Ls5e: addl $8,%eax
253.Ls6e: addl $8,%eax
254.Ls7e: addl $8,%eax
255.Ls8e: addl $8,%eax
256 addq %rbx,%rdi /* +64 */
257 subq %rax,%rdi /* correct destination with computed offset */
258
259 shlq $6,%rdx /* loop counter * 64 (stride length) */
260 addq %rax,%rdx /* add offset to loopcnt */
261 andl $63,%ecx /* remaining bytes */
262 addq %rcx,%rdx /* add them */
263 jmp .Lzero_rest
264
265 /* exception on quad word loop in tail handling */
266 /* ecx: loopcnt/8, %edx: length, rdi: correct */
267.Le_quad:
268 shll $3,%ecx
269 andl $7,%edx
270 addl %ecx,%edx
271 /* edx: bytes to zero, rdi: dest, eax:zero */
272.Lzero_rest:
273 cmpl $0,(%rsp)
274 jz .Le_zero
275 movq %rdx,%rcx
276.Le_byte:
277 xorl %eax,%eax
278.Le5: rep
279 stosb
280 /* when there is another exception while zeroing the rest just return */
281.Le_zero:
282 movq %rdx,%rax
283 jmp .Lende
284 CFI_ENDPROC 217 CFI_ENDPROC
285ENDPROC(copy_user_generic) 218ENDPROC(copy_user_generic_unrolled)
286 219
287 220/* Some CPUs run faster using the string copy instructions.
288 /* Some CPUs run faster using the string copy instructions. 221 * This is also a lot simpler. Use them when possible.
289 This is also a lot simpler. Use them when possible. 222 *
290 Patch in jmps to this code instead of copying it fully 223 * Only 4GB of copy is supported. This shouldn't be a problem
291 to avoid unwanted aliasing in the exception tables. */ 224 * because the kernel normally only writes from/to page sized chunks
292 225 * even if user space passed a longer buffer.
293 /* rdi destination 226 * And more would be dangerous because both Intel and AMD have
294 * rsi source 227 * errata with rep movsq > 4GB. If someone feels the need to fix
295 * rdx count 228 * this please consider this.
296 * ecx zero flag 229 *
297 * 230 * Input:
298 * Output: 231 * rdi destination
299 * eax uncopied bytes or 0 if successfull. 232 * rsi source
300 * 233 * rdx count
301 * Only 4GB of copy is supported. This shouldn't be a problem 234 *
302 * because the kernel normally only writes from/to page sized chunks 235 * Output:
303 * even if user space passed a longer buffer. 236 * eax uncopied bytes or 0 if successful.
304 * And more would be dangerous because both Intel and AMD have 237 */
305 * errata with rep movsq > 4GB. If someone feels the need to fix
306 * this please consider this.
307 */
308ENTRY(copy_user_generic_string) 238ENTRY(copy_user_generic_string)
309 CFI_STARTPROC 239 CFI_STARTPROC
310 movl %ecx,%r8d /* save zero flag */ 240 andl %edx,%edx
241 jz 4f
242 cmpl $8,%edx
243 jb 2f /* less than 8 bytes, go to byte copy loop */
244 ALIGN_DESTINATION
311 movl %edx,%ecx 245 movl %edx,%ecx
312 shrl $3,%ecx 246 shrl $3,%ecx
313 andl $7,%edx 247 andl $7,%edx
314 jz 10f 2481: rep
3151: rep
316 movsq
317 movl %edx,%ecx
3182: rep
319 movsb
3209: movl %ecx,%eax
321 ret
322
323 /* multiple of 8 byte */
32410: rep
325 movsq 249 movsq
326 xor %eax,%eax 2502: movl %edx,%ecx
2513: rep
252 movsb
2534: xorl %eax,%eax
327 ret 254 ret
328 255
329 /* exception handling */ 256 .section .fixup,"ax"
3303: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ 25711: lea (%rdx,%rcx,8),%rcx
331 jmp 6f 25812: movl %ecx,%edx /* ecx is zerorest also */
3325: movl %ecx,%eax /* exception on byte loop */ 259 jmp copy_user_handle_tail
333 /* eax: left over bytes */ 260 .previous
3346: testl %r8d,%r8d /* zero flag set? */
335 jz 7f
336 movl %eax,%ecx /* initialize x86 loop counter */
337 push %rax
338 xorl %eax,%eax
3398: rep
340 stosb /* zero the rest */
34111: pop %rax
3427: ret
343 CFI_ENDPROC
344END(copy_user_generic_c)
345 261
346 .section __ex_table,"a" 262 .section __ex_table,"a"
347 .quad 1b,3b 263 .align 8
348 .quad 2b,5b 264 .quad 1b,11b
349 .quad 8b,11b 265 .quad 3b,12b
350 .quad 10b,3b
351 .previous 266 .previous
267 CFI_ENDPROC
268ENDPROC(copy_user_generic_string)
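Both copy routines above now lean on the ALIGN_DESTINATION macro: it computes 8 - (dst & 7), copies that many bytes one at a time, subtracts them from the count and only then enters the unrolled (or rep movsq) loop, while every faulting label funnels the number of uncopied bytes into copy_user_handle_tail. The bookkeeping, without the fault handling, looks like this in plain C (a sketch, not the kernel's implementation):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Copy len bytes, aligning the destination to 8 bytes first, then
 * moving 8-byte words, then the byte tail: the shape of the new
 * copy_user_generic_unrolled fast path minus the exception tables. */
static void copy_aligned(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t head = (uintptr_t)d & 7;

	if (head) {
		head = 8 - head;	/* subl $8,%ecx ; negl %ecx */
		if (head > len)
			head = len;
		len -= head;		/* subl %ecx,%edx */
		while (head--)
			*d++ = *s++;
	}
	while (len >= 8) {		/* unrolled/quad-word loop */
		memcpy(d, s, 8);
		d += 8; s += 8; len -= 8;
	}
	while (len--)			/* byte tail */
		*d++ = *s++;
}

int main(void)
{
	char src[32] = "destination alignment example";
	char dst[32] = { 0 };

	copy_aligned(dst + 3, src, 20);	/* deliberately misaligned destination */
	printf("%.20s\n", dst + 3);
	return 0;
}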
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 9d3d1ab83763..cb0c112386fb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -1,4 +1,6 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
@@ -12,204 +14,124 @@
12#include <asm/current.h> 14#include <asm/current.h>
13#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
14#include <asm/thread_info.h> 16#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * This will force destination/source out of cache for more performance.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag when 1 zero on exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40 17
18 .macro ALIGN_DESTINATION
41#ifdef FIX_ALIGNMENT 19#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
43 movl %edi,%ecx 21 movl %edi,%ecx
44 andl $7,%ecx 22 andl $7,%ecx
45 jnz .Lbad_alignment 23 jz 102f /* already aligned */
46.Lafter_bad_alignment: 24 subl $8,%ecx
47#endif 25 negl %ecx
48 26 subl %ecx,%edx
49 movq %rdx,%rcx 27100: movb (%rsi),%al
50 28101: movb %al,(%rdi)
51 movl $64,%ebx 29 incq %rsi
52 shrq $6,%rdx 30 incq %rdi
53 decq %rdx 31 decl %ecx
54 js .Lhandle_tail 32 jnz 100b
55 33102:
56 .p2align 4 34 .section .fixup,"ax"
57.Lloop: 35103: addl %ecx,%edx /* ecx is zerorest also */
58.Ls1: movq (%rsi),%r11 36 jmp copy_user_handle_tail
59.Ls2: movq 1*8(%rsi),%r8 37 .previous
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75 38
76 dec %rdx 39 .section __ex_table,"a"
40 .align 8
41 .quad 100b,103b
42 .quad 101b,103b
43 .previous
44#endif
45 .endm
77 46
47/*
48 * copy_user_nocache - Uncached memory copy with exception handling
49 * This will force destination/source out of cache for more performance.
50 */
51ENTRY(__copy_user_nocache)
52 CFI_STARTPROC
53 cmpl $8,%edx
54 jb 20f /* less then 8 bytes, go to byte copy loop */
55 ALIGN_DESTINATION
56 movl %edx,%ecx
57 andl $63,%edx
58 shrl $6,%ecx
59 jz 17f
601: movq (%rsi),%r8
612: movq 1*8(%rsi),%r9
623: movq 2*8(%rsi),%r10
634: movq 3*8(%rsi),%r11
645: movnti %r8,(%rdi)
656: movnti %r9,1*8(%rdi)
667: movnti %r10,2*8(%rdi)
678: movnti %r11,3*8(%rdi)
689: movq 4*8(%rsi),%r8
6910: movq 5*8(%rsi),%r9
7011: movq 6*8(%rsi),%r10
7112: movq 7*8(%rsi),%r11
7213: movnti %r8,4*8(%rdi)
7314: movnti %r9,5*8(%rdi)
7415: movnti %r10,6*8(%rdi)
7516: movnti %r11,7*8(%rdi)
78 leaq 64(%rsi),%rsi 76 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi 77 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx 78 decl %ecx
95 leaq 8(%rdi),%rdi 79 jnz 1b
8017: movl %edx,%ecx
81 andl $7,%edx
82 shrl $3,%ecx
83 jz 20f
8418: movq (%rsi),%r8
8519: movnti %r8,(%rdi)
96 leaq 8(%rsi),%rsi 86 leaq 8(%rsi),%rsi
97 jnz .Lloop_8 87 leaq 8(%rdi),%rdi
98 88 decl %ecx
99.Lhandle_7: 89 jnz 18b
9020: andl %edx,%edx
91 jz 23f
100 movl %edx,%ecx 92 movl %edx,%ecx
101 andl $7,%ecx 9321: movb (%rsi),%al
102 jz .Lende 9422: movb %al,(%rdi)
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi 95 incq %rsi
96 incq %rdi
109 decl %ecx 97 decl %ecx
110 jnz .Lloop_1 98 jnz 21b
111 9923: xorl %eax,%eax
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 sfence 100 sfence
121 ret 101 ret
122 CFI_RESTORE_STATE
123 102
124#ifdef FIX_ALIGNMENT 103 .section .fixup,"ax"
125 /* align destination */ 10430: shll $6,%ecx
126 .p2align 4 105 addl %ecx,%edx
127.Lbad_alignment: 106 jmp 60f
128 movl $8,%r9d 10740: lea (%rdx,%rcx,8),%rdx
129 subl %ecx,%r9d 108 jmp 60f
130 movl %r9d,%ecx 10950: movl %ecx,%edx
131 cmpq %r9,%rdx 11060: sfence
132 jz .Lhandle_7 111 jmp copy_user_handle_tail
133 js .Lhandle_7 112 .previous
134.Lalign_1:
135.Ls11: movb (%rsi),%bl
136.Ld11: movb %bl,(%rdi)
137 incq %rsi
138 incq %rdi
139 decl %ecx
140 jnz .Lalign_1
141 subq %r9,%rdx
142 jmp .Lafter_bad_alignment
143#endif
144 113
145 /* table sorted by exception address */
146 .section __ex_table,"a" 114 .section __ex_table,"a"
147 .align 8 115 .quad 1b,30b
148 .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ 116 .quad 2b,30b
149 .quad .Ls2,.Ls1e 117 .quad 3b,30b
150 .quad .Ls3,.Ls1e 118 .quad 4b,30b
151 .quad .Ls4,.Ls1e 119 .quad 5b,30b
152 .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ 120 .quad 6b,30b
153 .quad .Ld2,.Ls2e 121 .quad 7b,30b
154 .quad .Ld3,.Ls3e 122 .quad 8b,30b
155 .quad .Ld4,.Ls4e 123 .quad 9b,30b
156 .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ 124 .quad 10b,30b
157 .quad .Ls6,.Ls5e 125 .quad 11b,30b
158 .quad .Ls7,.Ls5e 126 .quad 12b,30b
159 .quad .Ls8,.Ls5e 127 .quad 13b,30b
160 .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ 128 .quad 14b,30b
161 .quad .Ld6,.Ls6e 129 .quad 15b,30b
162 .quad .Ld7,.Ls7e 130 .quad 16b,30b
163 .quad .Ld8,.Ls8e 131 .quad 18b,40b
164 .quad .Ls9,.Le_quad 132 .quad 19b,40b
165 .quad .Ld9,.Le_quad 133 .quad 21b,50b
166 .quad .Ls10,.Le_byte 134 .quad 22b,50b
167 .quad .Ld10,.Le_byte
168#ifdef FIX_ALIGNMENT
169 .quad .Ls11,.Lzero_rest
170 .quad .Ld11,.Lzero_rest
171#endif
172 .quad .Le5,.Le_zero
173 .previous 135 .previous
174
175 /* eax: zero, ebx: 64 */
176.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
177.Ls2e: addl $8,%eax
178.Ls3e: addl $8,%eax
179.Ls4e: addl $8,%eax
180.Ls5e: addl $8,%eax
181.Ls6e: addl $8,%eax
182.Ls7e: addl $8,%eax
183.Ls8e: addl $8,%eax
184 addq %rbx,%rdi /* +64 */
185 subq %rax,%rdi /* correct destination with computed offset */
186
187 shlq $6,%rdx /* loop counter * 64 (stride length) */
188 addq %rax,%rdx /* add offset to loopcnt */
189 andl $63,%ecx /* remaining bytes */
190 addq %rcx,%rdx /* add them */
191 jmp .Lzero_rest
192
193 /* exception on quad word loop in tail handling */
194 /* ecx: loopcnt/8, %edx: length, rdi: correct */
195.Le_quad:
196 shll $3,%ecx
197 andl $7,%edx
198 addl %ecx,%edx
199 /* edx: bytes to zero, rdi: dest, eax:zero */
200.Lzero_rest:
201 cmpl $0,(%rsp) /* zero flag set? */
202 jz .Le_zero
203 movq %rdx,%rcx
204.Le_byte:
205 xorl %eax,%eax
206.Le5: rep
207 stosb
208 /* when there is another exception while zeroing the rest just return */
209.Le_zero:
210 movq %rdx,%rax
211 jmp .Lende
212 CFI_ENDPROC 136 CFI_ENDPROC
213ENDPROC(__copy_user_nocache) 137ENDPROC(__copy_user_nocache)
214
215
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c
index d710f2d167bb..f4568605d7d5 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
6 * 7 *
7 * The __delay function must _NOT_ be inlined as its execution time 8 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional 9 * depends wildly on alignment on many x86 processors. The additional
@@ -28,16 +29,22 @@
28/* simple loop based delay: */ 29/* simple loop based delay: */
29static void delay_loop(unsigned long loops) 30static void delay_loop(unsigned long loops)
30{ 31{
31 int d0; 32 asm volatile(
32 33 " test %0,%0 \n"
33 __asm__ __volatile__( 34 " jz 3f \n"
34 "\tjmp 1f\n" 35 " jmp 1f \n"
35 ".align 16\n" 36
36 "1:\tjmp 2f\n" 37 ".align 16 \n"
37 ".align 16\n" 38 "1: jmp 2f \n"
38 "2:\tdecl %0\n\tjns 2b" 39
39 :"=&a" (d0) 40 ".align 16 \n"
40 :"0" (loops)); 41 "2: dec %0 \n"
42 " jnz 2b \n"
43 "3: dec %0 \n"
44
45 : /* we don't need output */
46 :"a" (loops)
47 );
41} 48}
42 49
43/* TSC based delay: */ 50/* TSC based delay: */
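The rewritten delay_loop above keeps the aligned jump targets, so the loop's timing does not depend on where the code lands, but no longer needs an output operand; stripped of the alignment directives it is just a counted spin. Roughly the same thing in C, with a barrier so the compiler cannot delete the empty body (an equivalent sketch, not the kernel function):

#include <stdio.h>

static void delay_loop_c(unsigned long loops)
{
	while (loops--)
		asm volatile("" : : : "memory");	/* keep the iteration */
}

int main(void)
{
	delay_loop_c(1000000);		/* spin roughly a million iterations */
	printf("done\n");
	return 0;
}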
@@ -91,7 +98,7 @@ void use_tsc_delay(void)
91int __devinit read_current_timer(unsigned long *timer_val) 98int __devinit read_current_timer(unsigned long *timer_val)
92{ 99{
93 if (delay_fn == delay_tsc) { 100 if (delay_fn == delay_tsc) {
94 rdtscl(*timer_val); 101 rdtscll(*timer_val);
95 return 0; 102 return 0;
96 } 103 }
97 return -1; 104 return -1;
@@ -101,31 +108,30 @@ void __delay(unsigned long loops)
101{ 108{
102 delay_fn(loops); 109 delay_fn(loops);
103} 110}
111EXPORT_SYMBOL(__delay);
104 112
105inline void __const_udelay(unsigned long xloops) 113inline void __const_udelay(unsigned long xloops)
106{ 114{
107 int d0; 115 int d0;
108 116
109 xloops *= 4; 117 xloops *= 4;
110 __asm__("mull %0" 118 asm("mull %%edx"
111 :"=d" (xloops), "=&a" (d0) 119 :"=d" (xloops), "=&a" (d0)
112 :"1" (xloops), "0" 120 :"1" (xloops), "0"
113 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 121 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
114 122
115 __delay(++xloops); 123 __delay(++xloops);
116} 124}
125EXPORT_SYMBOL(__const_udelay);
117 126
118void __udelay(unsigned long usecs) 127void __udelay(unsigned long usecs)
119{ 128{
120 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ 129 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
121} 130}
131EXPORT_SYMBOL(__udelay);
122 132
123void __ndelay(unsigned long nsecs) 133void __ndelay(unsigned long nsecs)
124{ 134{
125 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 135 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
126} 136}
127
128EXPORT_SYMBOL(__delay);
129EXPORT_SYMBOL(__const_udelay);
130EXPORT_SYMBOL(__udelay);
131EXPORT_SYMBOL(__ndelay); 137EXPORT_SYMBOL(__ndelay);
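The magic numbers in __udelay()/__ndelay() are 32.32 fixed-point scale factors: 0x000010c7 is 2^32/10^6 rounded up and 0x00005 is 2^32/10^9 rounded up, and the mull in __const_udelay() keeps only the high 32 bits of the product, which is the divide by 2^32. The whole chain therefore computes roughly usecs * loops_per_jiffy * HZ / 10^6, plus one loop for rounding. A worked check of the arithmetic (the calibration values below are invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t lpj = 1000000;		/* hypothetical loops_per_jiffy */
	uint64_t hz = 1000;		/* hypothetical CONFIG_HZ */
	uint64_t usecs = 100;

	uint64_t xloops = usecs * 0x000010c7ULL;	/* ~usecs * 2^32 / 10^6 */
	xloops *= 4;					/* as in __const_udelay */
	uint64_t loops = (xloops * (lpj * (hz / 4))) >> 32;

	printf("fixed point: %llu loops, exact: %llu loops\n",
	       (unsigned long long)loops,
	       (unsigned long long)(usecs * lpj * hz / 1000000));
	return 0;
}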
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
deleted file mode 100644
index 4c441be92641..000000000000
--- a/arch/x86/lib/delay_64.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/timex.h>
14#include <linux/preempt.h>
15#include <linux/delay.h>
16#include <linux/init.h>
17
18#include <asm/delay.h>
19#include <asm/msr.h>
20
21#ifdef CONFIG_SMP
22#include <asm/smp.h>
23#endif
24
25int __devinit read_current_timer(unsigned long *timer_value)
26{
27 rdtscll(*timer_value);
28 return 0;
29}
30
31void __delay(unsigned long loops)
32{
33 unsigned bclock, now;
34 int cpu;
35
36 preempt_disable();
37 cpu = smp_processor_id();
38 rdtscl(bclock);
39 for (;;) {
40 rdtscl(now);
41 if ((now - bclock) >= loops)
42 break;
43
44 /* Allow RT tasks to run */
45 preempt_enable();
46 rep_nop();
47 preempt_disable();
48
49 /*
50 * It is possible that we moved to another CPU, and
51 * since TSC's are per-cpu we need to calculate
52 * that. The delay must guarantee that we wait "at
53 * least" the amount of time. Being moved to another
54 * CPU could make the wait longer but we just need to
55 * make sure we waited long enough. Rebalance the
56 * counter for this CPU.
57 */
58 if (unlikely(cpu != smp_processor_id())) {
59 loops -= (now - bclock);
60 cpu = smp_processor_id();
61 rdtscl(bclock);
62 }
63 }
64 preempt_enable();
65}
66EXPORT_SYMBOL(__delay);
67
68inline void __const_udelay(unsigned long xloops)
69{
70 __delay(((xloops * HZ *
71 cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1);
72}
73EXPORT_SYMBOL(__const_udelay);
74
75void __udelay(unsigned long usecs)
76{
77 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
78}
79EXPORT_SYMBOL(__udelay);
80
81void __ndelay(unsigned long nsecs)
82{
83 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
84}
85EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser.S
index 5448876261f8..ad374003742f 100644
--- a/arch/x86/lib/getuser_64.S
+++ b/arch/x86/lib/getuser.S
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Copyright 1998 Linus Torvalds 4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen 5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
6 * 7 *
7 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -13,14 +14,13 @@
13/* 14/*
14 * __get_user_X 15 * __get_user_X
15 * 16 *
16 * Inputs: %rcx contains the address. 17 * Inputs: %[r|e]ax contains the address.
17 * The register is modified, but all changes are undone 18 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it. 19 * before returning because the C code doesn't know about it.
19 * 20 *
20 * Outputs: %rax is error code (0 or -EFAULT) 21 * Outputs: %[r|e]ax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value 22 * %[r|e]dx contains zero-extended value
22 * 23 *
23 * %r8 is destroyed.
24 * 24 *
25 * These functions should not modify any other registers, 25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly. 26 * as they get called from within inline assembly.
@@ -32,78 +32,73 @@
32#include <asm/errno.h> 32#include <asm/errno.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h> 34#include <asm/thread_info.h>
35#include <asm/asm.h>
35 36
36 .text 37 .text
37ENTRY(__get_user_1) 38ENTRY(__get_user_1)
38 CFI_STARTPROC 39 CFI_STARTPROC
39 GET_THREAD_INFO(%r8) 40 GET_THREAD_INFO(%_ASM_DX)
40 cmpq threadinfo_addr_limit(%r8),%rcx 41 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
41 jae bad_get_user 42 jae bad_get_user
421: movzb (%rcx),%edx 431: movzb (%_ASM_AX),%edx
43 xorl %eax,%eax 44 xor %eax,%eax
44 ret 45 ret
45 CFI_ENDPROC 46 CFI_ENDPROC
46ENDPROC(__get_user_1) 47ENDPROC(__get_user_1)
47 48
48ENTRY(__get_user_2) 49ENTRY(__get_user_2)
49 CFI_STARTPROC 50 CFI_STARTPROC
50 GET_THREAD_INFO(%r8) 51 add $1,%_ASM_AX
51 addq $1,%rcx 52 jc bad_get_user
52 jc 20f 53 GET_THREAD_INFO(%_ASM_DX)
53 cmpq threadinfo_addr_limit(%r8),%rcx 54 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
54 jae 20f 55 jae bad_get_user
55 decq %rcx 562: movzwl -1(%_ASM_AX),%edx
562: movzwl (%rcx),%edx 57 xor %eax,%eax
57 xorl %eax,%eax
58 ret 58 ret
5920: decq %rcx
60 jmp bad_get_user
61 CFI_ENDPROC 59 CFI_ENDPROC
62ENDPROC(__get_user_2) 60ENDPROC(__get_user_2)
63 61
64ENTRY(__get_user_4) 62ENTRY(__get_user_4)
65 CFI_STARTPROC 63 CFI_STARTPROC
66 GET_THREAD_INFO(%r8) 64 add $3,%_ASM_AX
67 addq $3,%rcx 65 jc bad_get_user
68 jc 30f 66 GET_THREAD_INFO(%_ASM_DX)
69 cmpq threadinfo_addr_limit(%r8),%rcx 67 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
70 jae 30f 68 jae bad_get_user
71 subq $3,%rcx 693: mov -3(%_ASM_AX),%edx
723: movl (%rcx),%edx 70 xor %eax,%eax
73 xorl %eax,%eax
74 ret 71 ret
7530: subq $3,%rcx
76 jmp bad_get_user
77 CFI_ENDPROC 72 CFI_ENDPROC
78ENDPROC(__get_user_4) 73ENDPROC(__get_user_4)
79 74
75#ifdef CONFIG_X86_64
80ENTRY(__get_user_8) 76ENTRY(__get_user_8)
81 CFI_STARTPROC 77 CFI_STARTPROC
82 GET_THREAD_INFO(%r8) 78 add $7,%_ASM_AX
83 addq $7,%rcx 79 jc bad_get_user
84 jc 40f 80 GET_THREAD_INFO(%_ASM_DX)
85 cmpq threadinfo_addr_limit(%r8),%rcx 81 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
86 jae 40f 82 jae bad_get_user
87 subq $7,%rcx 834: movq -7(%_ASM_AX),%_ASM_DX
884: movq (%rcx),%rdx 84 xor %eax,%eax
89 xorl %eax,%eax
90 ret 85 ret
9140: subq $7,%rcx
92 jmp bad_get_user
93 CFI_ENDPROC 86 CFI_ENDPROC
94ENDPROC(__get_user_8) 87ENDPROC(__get_user_8)
88#endif
95 89
96bad_get_user: 90bad_get_user:
97 CFI_STARTPROC 91 CFI_STARTPROC
98 xorl %edx,%edx 92 xor %edx,%edx
99 movq $(-EFAULT),%rax 93 mov $(-EFAULT),%_ASM_AX
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102END(bad_get_user) 96END(bad_get_user)
103 97
104.section __ex_table,"a" 98.section __ex_table,"a"
105 .quad 1b,bad_get_user 99 _ASM_PTR 1b,bad_get_user
106 .quad 2b,bad_get_user 100 _ASM_PTR 2b,bad_get_user
107 .quad 3b,bad_get_user 101 _ASM_PTR 3b,bad_get_user
108 .quad 4b,bad_get_user 102#ifdef CONFIG_X86_64
109.previous 103 _ASM_PTR 4b,bad_get_user
104#endif
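The merged __get_user_N above replaces the old add/check/undo sequence with one pattern: add size-1 to the user pointer, treat a carry (address wrap) as a fault, fail if the last byte sits at or above addr_limit, then load at -(size-1) from the bumped pointer. The same range check in C with explicit unsigned-wrap detection (an illustration, not the kernel's uaccess code; the limit value is a stand-in):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Mirror the add/jc/cmp/jae sequence: the access is allowed only if
 * [addr, addr + size) neither wraps nor reaches the limit. */
static int range_ok(uintptr_t addr, size_t size, uintptr_t limit)
{
	uintptr_t last = addr + (size - 1);

	if (last < addr)	/* carry set: the pointer wrapped */
		return 0;
	if (last >= limit)	/* jae bad_get_user */
		return 0;
	return 1;
}

int main(void)
{
	uintptr_t limit = (uintptr_t)1 << 30;	/* stand-in for addr_limit */

	printf("%d\n", range_ok(0x1000, 8, limit));		/* 1: in range */
	printf("%d\n", range_ok(limit - 4, 8, limit));		/* 0: crosses limit */
	printf("%d\n", range_ok((uintptr_t)-4, 8, limit));	/* 0: wraps */
	return 0;
}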
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
deleted file mode 100644
index 6d84b53f12a2..000000000000
--- a/arch/x86/lib/getuser_32.S
+++ /dev/null
@@ -1,78 +0,0 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __get_user_X
18 *
19 * Inputs: %eax contains the address
20 *
21 * Outputs: %eax is error code (0 or -EFAULT)
22 * %edx contains zero-extended value
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28.text
29ENTRY(__get_user_1)
30 CFI_STARTPROC
31 GET_THREAD_INFO(%edx)
32 cmpl TI_addr_limit(%edx),%eax
33 jae bad_get_user
341: movzbl (%eax),%edx
35 xorl %eax,%eax
36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
39
40ENTRY(__get_user_2)
41 CFI_STARTPROC
42 addl $1,%eax
43 jc bad_get_user
44 GET_THREAD_INFO(%edx)
45 cmpl TI_addr_limit(%edx),%eax
46 jae bad_get_user
472: movzwl -1(%eax),%edx
48 xorl %eax,%eax
49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
52
53ENTRY(__get_user_4)
54 CFI_STARTPROC
55 addl $3,%eax
56 jc bad_get_user
57 GET_THREAD_INFO(%edx)
58 cmpl TI_addr_limit(%edx),%eax
59 jae bad_get_user
603: movl -3(%eax),%edx
61 xorl %eax,%eax
62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
65
66bad_get_user:
67 CFI_STARTPROC
68 xorl %edx,%edx
69 movl $-14,%eax
70 ret
71 CFI_ENDPROC
72END(bad_get_user)
73
74.section __ex_table,"a"
75 .long 1b,bad_get_user
76 .long 2b,bad_get_user
77 .long 3b,bad_get_user
78.previous
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 57d043fa893e..321cf720dbb6 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,36 +16,46 @@ static void __rdmsr_on_cpu(void *info)
16 rdmsr(rv->msr_no, rv->l, rv->h); 16 rdmsr(rv->msr_no, rv->l, rv->h);
17} 17}
18 18
19static void __rdmsr_safe_on_cpu(void *info) 19static void __wrmsr_on_cpu(void *info)
20{ 20{
21 struct msr_info *rv = info; 21 struct msr_info *rv = info;
22 22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); 23 wrmsr(rv->msr_no, rv->l, rv->h);
24} 24}
25 25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) 26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{ 27{
28 int err = 0; 28 int err;
29 struct msr_info rv; 29 struct msr_info rv;
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1);
34 err = rv.err;
35 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
37 }
38 *l = rv.l; 33 *l = rv.l;
39 *h = rv.h; 34 *h = rv.h;
40 35
41 return err; 36 return err;
42} 37}
43 38
44static void __wrmsr_on_cpu(void *info) 39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
45{ 55{
46 struct msr_info *rv = info; 56 struct msr_info *rv = info;
47 57
48 wrmsr(rv->msr_no, rv->l, rv->h); 58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
49} 59}
50 60
51static void __wrmsr_safe_on_cpu(void *info) 61static void __wrmsr_safe_on_cpu(void *info)
@@ -55,44 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
55 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); 65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
56} 66}
57 67
58static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) 68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
59{ 69{
60 int err = 0; 70 int err;
61 struct msr_info rv; 71 struct msr_info rv;
62 72
63 rv.msr_no = msr_no; 73 rv.msr_no = msr_no;
64 rv.l = l; 74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
65 rv.h = h; 75 *l = rv.l;
66 if (safe) { 76 *h = rv.h;
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1);
68 err = rv.err;
69 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
71 }
72
73 return err;
74}
75
76void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
77{
78 _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
79}
80 77
81void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 78 return err ? err : rv.err;
82{
83 _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
84} 79}
85 80
86/* These "safe" variants are slower and should be used when the target MSR
87 may not actually exist. */
88int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
89{ 82{
90 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); 83 int err;
91} 84 struct msr_info rv;
92 85
93int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 86 rv.msr_no = msr_no;
94{ 87 rv.l = l;
95 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); 88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
96} 92}
97 93
98EXPORT_SYMBOL(rdmsr_on_cpu); 94EXPORT_SYMBOL(rdmsr_on_cpu);
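
With the simplification above, each cross-CPU MSR access is a single smp_call_function_single() call, and the "safe" variants report a missing MSR through the return value instead of faulting. A minimal caller sketch (the helper name and the choice of MSR 0x1a0, IA32_MISC_ENABLE, are examples only, not part of the patch):

#include <asm/msr.h>

/* Read an MSR on another CPU; a nonzero return is either the
 * smp_call_function_single() error or the rdmsr_safe() error. */
static int example_read_misc_enable(unsigned int cpu, u32 *lo, u32 *hi)
{
	return rdmsr_safe_on_cpu(cpu, 0x1a0, lo, hi);
}
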
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S
index f58fba109d18..36b0d15ae6e9 100644
--- a/arch/x86/lib/putuser_32.S
+++ b/arch/x86/lib/putuser.S
@@ -2,6 +2,8 @@
2 * __put_user functions. 2 * __put_user functions.
3 * 3 *
4 * (C) Copyright 2005 Linus Torvalds 4 * (C) Copyright 2005 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
5 * 7 *
6 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -11,6 +13,8 @@
11#include <linux/linkage.h> 13#include <linux/linkage.h>
12#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
13#include <asm/thread_info.h> 15#include <asm/thread_info.h>
16#include <asm/errno.h>
17#include <asm/asm.h>
14 18
15 19
16/* 20/*
@@ -26,73 +30,68 @@
26 */ 30 */
27 31
28#define ENTER CFI_STARTPROC ; \ 32#define ENTER CFI_STARTPROC ; \
29 pushl %ebx ; \ 33 GET_THREAD_INFO(%_ASM_BX)
30 CFI_ADJUST_CFA_OFFSET 4 ; \ 34#define EXIT ret ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC 35 CFI_ENDPROC
38 36
39.text 37.text
40ENTRY(__put_user_1) 38ENTRY(__put_user_1)
41 ENTER 39 ENTER
42 cmpl TI_addr_limit(%ebx),%ecx 40 cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
43 jae bad_put_user 41 jae bad_put_user
441: movb %al,(%ecx) 421: movb %al,(%_ASM_CX)
45 xorl %eax,%eax 43 xor %eax,%eax
46 EXIT 44 EXIT
47ENDPROC(__put_user_1) 45ENDPROC(__put_user_1)
48 46
49ENTRY(__put_user_2) 47ENTRY(__put_user_2)
50 ENTER 48 ENTER
51 movl TI_addr_limit(%ebx),%ebx 49 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
52 subl $1,%ebx 50 sub $1,%_ASM_BX
53 cmpl %ebx,%ecx 51 cmp %_ASM_BX,%_ASM_CX
54 jae bad_put_user 52 jae bad_put_user
552: movw %ax,(%ecx) 532: movw %ax,(%_ASM_CX)
56 xorl %eax,%eax 54 xor %eax,%eax
57 EXIT 55 EXIT
58ENDPROC(__put_user_2) 56ENDPROC(__put_user_2)
59 57
60ENTRY(__put_user_4) 58ENTRY(__put_user_4)
61 ENTER 59 ENTER
62 movl TI_addr_limit(%ebx),%ebx 60 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
63 subl $3,%ebx 61 sub $3,%_ASM_BX
64 cmpl %ebx,%ecx 62 cmp %_ASM_BX,%_ASM_CX
65 jae bad_put_user 63 jae bad_put_user
663: movl %eax,(%ecx) 643: movl %eax,(%_ASM_CX)
67 xorl %eax,%eax 65 xor %eax,%eax
68 EXIT 66 EXIT
69ENDPROC(__put_user_4) 67ENDPROC(__put_user_4)
70 68
71ENTRY(__put_user_8) 69ENTRY(__put_user_8)
72 ENTER 70 ENTER
73 movl TI_addr_limit(%ebx),%ebx 71 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
74 subl $7,%ebx 72 sub $7,%_ASM_BX
75 cmpl %ebx,%ecx 73 cmp %_ASM_BX,%_ASM_CX
76 jae bad_put_user 74 jae bad_put_user
774: movl %eax,(%ecx) 754: mov %_ASM_AX,(%_ASM_CX)
785: movl %edx,4(%ecx) 76#ifdef CONFIG_X86_32
79 xorl %eax,%eax 775: movl %edx,4(%_ASM_CX)
78#endif
79 xor %eax,%eax
80 EXIT 80 EXIT
81ENDPROC(__put_user_8) 81ENDPROC(__put_user_8)
82 82
83bad_put_user: 83bad_put_user:
84 CFI_STARTPROC simple 84 CFI_STARTPROC
85 CFI_DEF_CFA esp, 2*4 85 movl $-EFAULT,%eax
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
88 movl $-14,%eax
89 EXIT 86 EXIT
90END(bad_put_user) 87END(bad_put_user)
91 88
92.section __ex_table,"a" 89.section __ex_table,"a"
93 .long 1b,bad_put_user 90 _ASM_PTR 1b,bad_put_user
94 .long 2b,bad_put_user 91 _ASM_PTR 2b,bad_put_user
95 .long 3b,bad_put_user 92 _ASM_PTR 3b,bad_put_user
96 .long 4b,bad_put_user 93 _ASM_PTR 4b,bad_put_user
97 .long 5b,bad_put_user 94#ifdef CONFIG_X86_32
95 _ASM_PTR 5b,bad_put_user
96#endif
98.previous 97.previous
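
These stubs sit behind the ordinary put_user() macro; callers never invoke __put_user_N directly. A minimal sketch of the C-side usage (illustrative, not part of the patch; the helper name is invented):

#include <asm/uaccess.h>

/* Store a 32-bit value to user space; put_user() picks __put_user_4
 * from the operand size and returns 0 on success or -EFAULT. */
static int example_report_value(unsigned int __user *uptr, unsigned int val)
{
	return put_user(val, uptr);
}
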
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
deleted file mode 100644
index 4989f5a8fa9b..000000000000
--- a/arch/x86/lib/putuser_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
29#include <asm/page.h>
30#include <asm/errno.h>
31#include <asm/asm-offsets.h>
32#include <asm/thread_info.h>
33
34 .text
35ENTRY(__put_user_1)
36 CFI_STARTPROC
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
45
46ENTRY(__put_user_2)
47 CFI_STARTPROC
48 GET_THREAD_INFO(%r8)
49 addq $1,%rcx
50 jc 20f
51 cmpq threadinfo_addr_limit(%r8),%rcx
52 jae 20f
53 decq %rcx
542: movw %dx,(%rcx)
55 xorl %eax,%eax
56 ret
5720: decq %rcx
58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
61
62ENTRY(__put_user_4)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl %edx,(%rcx)
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
77
78ENTRY(__put_user_8)
79 CFI_STARTPROC
80 GET_THREAD_INFO(%r8)
81 addq $7,%rcx
82 jc 40f
83 cmpq threadinfo_addr_limit(%r8),%rcx
84 jae 40f
85 subq $7,%rcx
864: movq %rdx,(%rcx)
87 xorl %eax,%eax
88 ret
8940: subq $7,%rcx
90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
93
94bad_put_user:
95 CFI_STARTPROC
96 movq $(-EFAULT),%rax
97 ret
98 CFI_ENDPROC
99END(bad_put_user)
100
101.section __ex_table,"a"
102 .quad 1b,bad_put_user
103 .quad 2b,bad_put_user
104 .quad 3b,bad_put_user
105 .quad 4b,bad_put_user
106.previous
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094d..82004d2bf05e 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
22 "testb %%al,%%al\n\t" 22 "testb %%al,%%al\n\t"
23 "jne 1b" 23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2) 24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src), "1" (dest) : "memory"); 25 : "0" (src), "1" (dest) : "memory");
26 return dest; 26 return dest;
27} 27}
28EXPORT_SYMBOL(strcpy); 28EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
42 "stosb\n" 42 "stosb\n"
43 "2:" 43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) 44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src), "1" (dest), "2" (count) : "memory"); 45 : "0" (src), "1" (dest), "2" (count) : "memory");
46 return dest; 46 return dest;
47} 47}
48EXPORT_SYMBOL(strncpy); 48EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
60 "testb %%al,%%al\n\t" 60 "testb %%al,%%al\n\t"
61 "jne 1b" 61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) 62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); 63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
64 return dest; 64 return dest;
65} 65}
66EXPORT_SYMBOL(strcat); 66EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
105 "2:\tsbbl %%eax,%%eax\n\t" 105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n" 106 "orb $1,%%al\n"
107 "3:" 107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1) 108 : "=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs), "2" (ct) 109 : "1" (cs), "2" (ct)
110 :"memory"); 110 : "memory");
111 return res; 111 return res;
112} 112}
113EXPORT_SYMBOL(strcmp); 113EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
130 "3:\tsbbl %%eax,%%eax\n\t" 130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n" 131 "orb $1,%%al\n"
132 "4:" 132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) 133 : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs), "2" (ct), "3" (count) 134 : "1" (cs), "2" (ct), "3" (count)
135 :"memory"); 135 : "memory");
136 return res; 136 return res;
137} 137}
138EXPORT_SYMBOL(strncmp); 138EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
152 "movl $1,%1\n" 152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t" 153 "2:\tmovl %1,%0\n\t"
154 "decl %0" 154 "decl %0"
155 :"=a" (res), "=&S" (d0) 155 : "=a" (res), "=&S" (d0)
156 :"1" (s), "0" (c) 156 : "1" (s), "0" (c)
157 :"memory"); 157 : "memory");
158 return res; 158 return res;
159} 159}
160EXPORT_SYMBOL(strchr); 160EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
169 "scasb\n\t" 169 "scasb\n\t"
170 "notl %0\n\t" 170 "notl %0\n\t"
171 "decl %0" 171 "decl %0"
172 :"=c" (res), "=&D" (d0) 172 : "=c" (res), "=&D" (d0)
173 :"1" (s), "a" (0), "0" (0xffffffffu) 173 : "1" (s), "a" (0), "0" (0xffffffffu)
174 :"memory"); 174 : "memory");
175 return res; 175 return res;
176} 176}
177EXPORT_SYMBOL(strlen); 177EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
189 "je 1f\n\t" 189 "je 1f\n\t"
190 "movl $1,%0\n" 190 "movl $1,%0\n"
191 "1:\tdecl %0" 191 "1:\tdecl %0"
192 :"=D" (res), "=&c" (d0) 192 : "=D" (res), "=&c" (d0)
193 :"a" (c), "0" (cs), "1" (count) 193 : "a" (c), "0" (cs), "1" (count)
194 :"memory"); 194 : "memory");
195 return res; 195 return res;
196} 196}
197EXPORT_SYMBOL(memchr); 197EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
228 "cmpl $-1,%1\n\t" 228 "cmpl $-1,%1\n\t"
229 "jne 1b\n" 229 "jne 1b\n"
230 "3:\tsubl %2,%0" 230 "3:\tsubl %2,%0"
231 :"=a" (res), "=&d" (d0) 231 : "=a" (res), "=&d" (d0)
232 :"c" (s), "1" (count) 232 : "c" (s), "1" (count)
233 :"memory"); 233 : "memory");
234 return res; 234 return res;
235} 235}
236EXPORT_SYMBOL(strnlen); 236EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f3..8e2d55f754bf 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
23 "jne 1b\n\t" 23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t" 24 "xorl %%eax,%%eax\n\t"
25 "2:" 25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1) 26 : "=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) 27 : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di"); 28 : "dx", "di");
29return __res; 29return __res;
30} 30}
31 31
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
1/*
2 * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
3 * Copyright 2008 by Steven Rostedt, Red Hat, Inc
4 * (inspired by Andi Kleen's thunk_64.S)
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func
31 .globl \name
32\name:
33 pushl %eax
34 pushl %ecx
35 pushl %edx
36	/* Place EIP in arg1 */
37 movl 3*4(%esp), %eax
38 call \func
39 popl %edx
40 popl %ecx
41 popl %eax
42 ret
43 .endm
44
45 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
46 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif
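
The movl of the saved EIP into %eax works because the 32-bit kernel's regparm calling convention passes the first argument in %eax, so the tracer receives the interrupted return address as its parameter. Roughly, the C side being called has this shape (prototypes only; the bodies live in the lockdep/tracing code and are not shown here):

/* Sketch of the callees the thunks jump to: */
void trace_hardirqs_on_caller(unsigned long caller_addr);
void trace_hardirqs_off_caller(unsigned long caller_addr);
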
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
2 * Save registers before calling assembly functions. This avoids 2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs. 3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs. 4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
5 * Subject to the GNU public license, v.2. No warranty of any kind. 6 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */ 7 */
7 8
@@ -42,8 +43,22 @@
42#endif 43#endif
43 44
44#ifdef CONFIG_TRACE_IRQFLAGS 45#ifdef CONFIG_TRACE_IRQFLAGS
45 thunk trace_hardirqs_on_thunk,trace_hardirqs_on 46 /* put return address in rdi (arg1) */
46 thunk trace_hardirqs_off_thunk,trace_hardirqs_off 47 .macro thunk_ra name,func
48 .globl \name
49\name:
50 CFI_STARTPROC
51 SAVE_ARGS
52	/* SAVE_ARGS pushes 9 elements */
53 /* the next element would be the rip */
54 movq 9*8(%rsp), %rdi
55 call \func
56 jmp restore
57 CFI_ENDPROC
58 .endm
59
60 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
61 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif 62#endif
48 63
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 24e60944971a..9e68075544f6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15#include <asm/mmx.h> 15#include <asm/mmx.h>
16 16
17#ifdef CONFIG_X86_INTEL_USERCOPY
18/*
19 * Alignment at which movsl is preferred for bulk memory copies.
20 */
21struct movsl_mask movsl_mask __read_mostly;
22#endif
23
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) 24static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{ 25{
19#ifdef CONFIG_X86_INTEL_USERCOPY 26#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0c89d1bb0287..f4df6e7c718b 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -158,3 +158,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
158} 158}
159EXPORT_SYMBOL(copy_in_user); 159EXPORT_SYMBOL(copy_in_user);
160 160
161/*
162 * Try to copy the last bytes and clear the rest if needed.
163 * Since a protection fault in copy_from/to_user is not a normal situation,
164 * it is not necessary to optimize tail handling.
165 */
166unsigned long
167copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
168{
169 char c;
170 unsigned zero_len;
171
172 for (; len; --len) {
173 if (__get_user_nocheck(c, from++, sizeof(char)))
174 break;
175 if (__put_user_nocheck(c, to++, sizeof(char)))
176 break;
177 }
178
179 for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
180 if (__put_user_nocheck(c, to++, sizeof(char)))
181 break;
182 return len;
183}
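
copy_user_handle_tail() is only reached from the copy_user fixup path; from a caller's point of view the usual contract still holds: the copy routine returns how many bytes were left uncopied, and with zerorest set the destination tail is cleared. A sketch of that caller-visible behaviour (illustrative, not part of the patch):

#include <asm/uaccess.h>

/* copy_from_user() returns the number of bytes it could NOT copy;
 * on a partial fault the remainder of kbuf is zeroed by the slow path. */
static int example_fetch(void *kbuf, const void __user *ubuf, size_t len)
{
	if (copy_from_user(kbuf, ubuf, len))
		return -EFAULT;
	return 0;
}
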
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 0c28a071824c..37b9ae4d44c5 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,13 +10,15 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13#include <mach_ipi.h>
14
13#ifdef CONFIG_HOTPLUG_CPU 15#ifdef CONFIG_HOTPLUG_CPU
14#define DEFAULT_SEND_IPI (1) 16#define DEFAULT_SEND_IPI (1)
15#else 17#else
16#define DEFAULT_SEND_IPI (0) 18#define DEFAULT_SEND_IPI (0)
17#endif 19#endif
18 20
19int no_broadcast=DEFAULT_SEND_IPI; 21int no_broadcast = DEFAULT_SEND_IPI;
20 22
21/** 23/**
22 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors 24 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
@@ -29,18 +31,13 @@ int no_broadcast=DEFAULT_SEND_IPI;
29 **/ 31 **/
30void __init pre_intr_init_hook(void) 32void __init pre_intr_init_hook(void)
31{ 33{
34 if (x86_quirks->arch_pre_intr_init) {
35 if (x86_quirks->arch_pre_intr_init())
36 return;
37 }
32 init_ISA_irqs(); 38 init_ISA_irqs();
33} 39}
34 40
35/*
36 * IRQ2 is cascade interrupt to second interrupt controller
37 */
38static struct irqaction irq2 = {
39 .handler = no_action,
40 .mask = CPU_MASK_NONE,
41 .name = "cascade",
42};
43
44/** 41/**
45 * intr_init_hook - post gate setup interrupt initialisation 42 * intr_init_hook - post gate setup interrupt initialisation
46 * 43 *
@@ -52,12 +49,10 @@ static struct irqaction irq2 = {
52 **/ 49 **/
53void __init intr_init_hook(void) 50void __init intr_init_hook(void)
54{ 51{
55#ifdef CONFIG_X86_LOCAL_APIC 52 if (x86_quirks->arch_intr_init) {
56 apic_intr_init(); 53 if (x86_quirks->arch_intr_init())
57#endif 54 return;
58 55 }
59 if (!acpi_ioapic)
60 setup_irq(2, &irq2);
61} 56}
62 57
63/** 58/**
@@ -65,7 +60,7 @@ void __init intr_init_hook(void)
65 * 60 *
66 * Description: 61 * Description:
67 * generally used to activate any machine specific identification 62 * generally used to activate any machine specific identification
68 * routines that may be needed before setup_arch() runs. On VISWS 63 * routines that may be needed before setup_arch() runs. On Voyager
69 * this is used to get the board revision and type. 64 * this is used to get the board revision and type.
70 **/ 65 **/
71void __init pre_setup_arch_hook(void) 66void __init pre_setup_arch_hook(void)
@@ -81,6 +76,10 @@ void __init pre_setup_arch_hook(void)
81 **/ 76 **/
82void __init trap_init_hook(void) 77void __init trap_init_hook(void)
83{ 78{
79 if (x86_quirks->arch_trap_init) {
80 if (x86_quirks->arch_trap_init())
81 return;
82 }
84} 83}
85 84
86static struct irqaction irq0 = { 85static struct irqaction irq0 = {
@@ -91,6 +90,16 @@ static struct irqaction irq0 = {
91}; 90};
92 91
93/** 92/**
93 * pre_time_init_hook - do any specific initialisations before the system timer is set up.
94 *
95 **/
96void __init pre_time_init_hook(void)
97{
98 if (x86_quirks->arch_pre_time_init)
99 x86_quirks->arch_pre_time_init();
100}
101
102/**
94 * time_init_hook - do any specific initialisations for the system timer. 103 * time_init_hook - do any specific initialisations for the system timer.
95 * 104 *
96 * Description: 105 * Description:
@@ -99,6 +108,16 @@ static struct irqaction irq0 = {
99 **/ 108 **/
100void __init time_init_hook(void) 109void __init time_init_hook(void)
101{ 110{
111 if (x86_quirks->arch_time_init) {
112 /*
113 * A nonzero return code does not mean failure, it means
114 * that the architecture quirk does not want any
115 * generic (timer) setup to be performed after this:
116 */
117 if (x86_quirks->arch_time_init())
118 return;
119 }
120
102 irq0.mask = cpumask_of_cpu(0); 121 irq0.mask = cpumask_of_cpu(0);
103 setup_irq(0, &irq0); 122 setup_irq(0, &irq0);
104} 123}
@@ -142,45 +161,3 @@ static int __init print_ipi_mode(void)
142 161
143late_initcall(print_ipi_mode); 162late_initcall(print_ipi_mode);
144 163
145/**
146 * machine_specific_memory_setup - Hook for machine specific memory setup.
147 *
148 * Description:
149 * This is included late in kernel/setup.c so that it can make
150 * use of all of the static functions.
151 **/
152
153char * __init machine_specific_memory_setup(void)
154{
155 char *who;
156
157
158 who = "BIOS-e820";
159
160 /*
161 * Try to copy the BIOS-supplied E820-map.
162 *
163 * Otherwise fake a memory map; one section from 0k->640k,
164 * the next section from 1mb->appropriate_mem_k
165 */
166 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
167 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
168 < 0) {
169 unsigned long mem_size;
170
171 /* compare results from other methods and take the greater */
172 if (boot_params.alt_mem_k
173 < boot_params.screen_info.ext_mem_k) {
174 mem_size = boot_params.screen_info.ext_mem_k;
175 who = "BIOS-88";
176 } else {
177 mem_size = boot_params.alt_mem_k;
178 who = "BIOS-e801";
179 }
180
181 e820.nr_map = 0;
182 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
183 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
184 }
185 return who;
186}
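
Each hook above now defers to an optional x86_quirks callback before (or instead of) the generic code. A minimal sketch of how a subarchitecture might use this (the example names and body are assumptions; only the field name and the return convention come from the code above):

#include <asm/setup.h>

/* Returning nonzero from arch_time_init tells time_init_hook() that
 * no generic timer setup should run afterwards. */
static int __init example_arch_time_init(void)
{
	/* program a platform-specific timer here ... */
	return 1;
}

static struct x86_quirks example_quirks __initdata = {
	.arch_time_init		= example_arch_time_init,
};
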
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 69dd4da218dc..000000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
6obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa0..000000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 19d6d407737b..6730f4e7c744 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -2,7 +2,10 @@
2# Makefile for the generic architecture 2# Makefile for the generic architecture
3# 3#
4 4
5EXTRA_CFLAGS := -Iarch/x86/kernel 5EXTRA_CFLAGS := -Iarch/x86/kernel
6 6
7obj-y := probe.o summit.o bigsmp.o es7000.o default.o 7obj-y := probe.o default.o
8obj-y += ../../x86/mach-es7000/ 8obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 95fc463056d0..df37fc9d6a26 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,28 +5,25 @@
5#define APIC_DEFINITION 1 5#define APIC_DEFINITION 1
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h> 8#include <asm/mpspec.h>
10#include <asm/genapic.h> 9#include <asm/genapic.h>
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/apicdef.h> 11#include <asm/apicdef.h>
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <linux/dmi.h> 14#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h> 15#include <asm/bigsmp/apicdef.h>
18#include <asm/mach-bigsmp/mach_apicdef.h> 16#include <linux/smp.h>
19#include <asm/mach-bigsmp/mach_ipi.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h>
20#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
21 20
22static int dmi_bigsmp; /* can be set by dmi scanners */ 21static int dmi_bigsmp; /* can be set by dmi scanners */
23 22
24static int hp_ht_bigsmp(const struct dmi_system_id *d) 23static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{ 24{
26#ifdef CONFIG_X86_GENERICARCH
27 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); 25 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
28 dmi_bigsmp = 1; 26 dmi_bigsmp = 1;
29#endif
30 return 0; 27 return 0;
31} 28}
32 29
@@ -48,7 +45,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
48static int probe_bigsmp(void) 45static int probe_bigsmp(void)
49{ 46{
50 if (def_to_bigsmp) 47 if (def_to_bigsmp)
51 dmi_bigsmp = 1; 48 dmi_bigsmp = 1;
52 else 49 else
53 dmi_check_system(bigsmp_dmi_table); 50 dmi_check_system(bigsmp_dmi_table);
54 return dmi_bigsmp; 51 return dmi_bigsmp;
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c4..6513d41ea21e 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h> 14#include <asm/es7000/apicdef.h>
17#include <asm/mach-es7000/mach_apic.h> 15#include <linux/smp.h>
18#include <asm/mach-es7000/mach_ipi.h> 16#include <asm/es7000/apic.h>
19#include <asm/mach-es7000/mach_mpparse.h> 17#include <asm/es7000/ipi.h>
20#include <asm/mach-es7000/mach_wakecpu.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h>
21 20
22static int probe_es7000(void) 21static int probe_es7000(void)
23{ 22{
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
48/* Hook from generic ACPI tables.c */ 47/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 48static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 49{
51 unsigned long oem_addr; 50 unsigned long oem_addr = 0;
51 int check_dsdt;
52 int ret = 0;
53
54 /* check dsdt at first to avoid clear fix_map for oem_addr */
55 check_dsdt = es7000_check_dsdt();
56
52 if (!find_unisys_acpi_oem_table(&oem_addr)) { 57 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt()) 58 if (check_dsdt)
54 return parse_unisys_oem((char *)oem_addr); 59 ret = parse_unisys_oem((char *)oem_addr);
55 else { 60 else {
56 setup_unisys(); 61 setup_unisys();
57 return 1; 62 ret = 1;
58 } 63 }
64 /*
65 * we need to unmap it
66 */
67 unmap_unisys_acpi_oem_table(oem_addr);
59 } 68 }
60 return 0; 69 return ret;
61} 70}
62#else 71#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 72static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
new file mode 100644
index 000000000000..8cf58394975e
--- /dev/null
+++ b/arch/x86/mach-generic/numaq.c
@@ -0,0 +1,41 @@
1/*
2 * APIC driver for the IBM NUMAQ chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/genapic.h>
9#include <asm/fixmap.h>
10#include <asm/apicdef.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <asm/numaq/apicdef.h>
15#include <linux/smp.h>
16#include <asm/numaq/apic.h>
17#include <asm/numaq/ipi.h>
18#include <asm/numaq/mpparse.h>
19#include <asm/numaq/wakecpu.h>
20#include <asm/numaq.h>
21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
23 char *productid)
24{
25 numaq_mps_oem_check(mpc, oem, productid);
26 return found_numaq;
27}
28
29static int probe_numaq(void)
30{
31 /* already know from get_memcfg_numaq() */
32 return found_numaq;
33}
34
35/* Hook from generic ACPI tables.c */
36static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
37{
38 return 0;
39}
40
41struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index c5ae751b994a..5a7e4619e1c4 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -16,6 +16,7 @@
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18 18
19extern struct genapic apic_numaq;
19extern struct genapic apic_summit; 20extern struct genapic apic_summit;
20extern struct genapic apic_bigsmp; 21extern struct genapic apic_bigsmp;
21extern struct genapic apic_es7000; 22extern struct genapic apic_es7000;
@@ -24,9 +25,18 @@ extern struct genapic apic_default;
24struct genapic *genapic = &apic_default; 25struct genapic *genapic = &apic_default;
25 26
26static struct genapic *apic_probe[] __initdata = { 27static struct genapic *apic_probe[] __initdata = {
28#ifdef CONFIG_X86_NUMAQ
29 &apic_numaq,
30#endif
31#ifdef CONFIG_X86_SUMMIT
27 &apic_summit, 32 &apic_summit,
33#endif
34#ifdef CONFIG_X86_BIGSMP
28 &apic_bigsmp, 35 &apic_bigsmp,
36#endif
37#ifdef CONFIG_X86_ES7000
29 &apic_es7000, 38 &apic_es7000,
39#endif
30 &apic_default, /* must be last */ 40 &apic_default, /* must be last */
31 NULL, 41 NULL,
32}; 42};
@@ -54,6 +64,7 @@ early_param("apic", parse_apic);
54 64
55void __init generic_bigsmp_probe(void) 65void __init generic_bigsmp_probe(void)
56{ 66{
67#ifdef CONFIG_X86_BIGSMP
57 /* 68 /*
58 * This routine is used to switch to bigsmp mode when 69 * This routine is used to switch to bigsmp mode when
59 * - There is no apic= option specified by the user 70 * - There is no apic= option specified by the user
@@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void)
67 printk(KERN_INFO "Overriding APIC driver with %s\n", 78 printk(KERN_INFO "Overriding APIC driver with %s\n",
68 genapic->name); 79 genapic->name);
69 } 80 }
81#endif
70} 82}
71 83
72void __init generic_apic_probe(void) 84void __init generic_apic_probe(void)
@@ -88,7 +100,8 @@ void __init generic_apic_probe(void)
88 100
89/* These functions can switch the APIC even after the initial ->probe() */ 101/* These functions can switch the APIC even after the initial ->probe() */
90 102
91int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) 103int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
104 char *productid)
92{ 105{
93 int i; 106 int i;
94 for (i = 0; apic_probe[i]; ++i) { 107 for (i = 0; apic_probe[i]; ++i) {
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1e..6ad6b67a723d 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h> 14#include <asm/summit/apicdef.h>
17#include <asm/mach-summit/mach_apicdef.h> 15#include <linux/smp.h>
18#include <asm/mach-summit/mach_ipi.h> 16#include <asm/summit/apic.h>
19#include <asm/mach-summit/mach_mpparse.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h>
20 19
21static int probe_summit(void) 20static int probe_summit(void)
22{ 21{
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
index a037041817c7..4f4e50c3ad3b 100644
--- a/arch/x86/mach-rdc321x/platform.c
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -25,7 +25,6 @@
25#include <linux/list.h> 25#include <linux/list.h>
26#include <linux/device.h> 26#include <linux/device.h>
27#include <linux/platform_device.h> 27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h> 28#include <linux/leds.h>
30 29
31#include <asm/gpio.h> 30#include <asm/gpio.h>
diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile
deleted file mode 100644
index 835fd96ad768..000000000000
--- a/arch/x86/mach-visws/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o traps.o reboot.o
6
7obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
8obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
deleted file mode 100644
index 57484e91ab90..000000000000
--- a/arch/x86/mach-visws/mpparse.c
+++ /dev/null
@@ -1,88 +0,0 @@
1
2#include <linux/init.h>
3#include <linux/smp.h>
4
5#include <asm/smp.h>
6#include <asm/io.h>
7
8#include "cobalt.h"
9#include "mach_apic.h"
10
11/* Have we found an MP table */
12int smp_found_config;
13
14int pic_mode;
15
16extern unsigned int __cpuinitdata maxcpus;
17
18/*
19 * The Visual Workstation is Intel MP compliant in the hardware
20 * sense, but it doesn't have a BIOS(-configuration table).
21 * No problem for Linux.
22 */
23
24static void __init MP_processor_info (struct mpc_config_processor *m)
25{
26 int ver, logical_apicid;
27 physid_mask_t apic_cpus;
28
29 if (!(m->mpc_cpuflag & CPU_ENABLED))
30 return;
31
32 logical_apicid = m->mpc_apicid;
33 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
34 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
35 m->mpc_apicid,
36 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
37 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
38 m->mpc_apicver);
39
40 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
41 boot_cpu_physical_apicid = m->mpc_apicid;
42
43 ver = m->mpc_apicver;
44 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
45 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
46 m->mpc_apicid, MAX_APICS);
47 return;
48 }
49
50 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
51 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
52 /*
53 * Validate version
54 */
55 if (ver == 0x0) {
56 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
57 "fixing up to 0x10. (tell your hw vendor)\n",
58 m->mpc_apicid);
59 ver = 0x10;
60 }
61 apic_version[m->mpc_apicid] = ver;
62}
63
64void __init find_smp_config(void)
65{
66 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
67 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
68
69 if (ncpus > CO_CPU_MAX) {
70 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
71 ncpus, mp);
72
73 ncpus = CO_CPU_MAX;
74 }
75
76 if (ncpus > maxcpus)
77 ncpus = maxcpus;
78
79 smp_found_config = 1;
80 while (ncpus--)
81 MP_processor_info(mp++);
82
83 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
84}
85
86void __init get_smp_config (void)
87{
88}
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
deleted file mode 100644
index 99332abfad42..000000000000
--- a/arch/x86/mach-visws/reboot.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/delay.h>
4
5#include <asm/io.h>
6#include "piix4.h"
7
8void (*pm_power_off)(void);
9EXPORT_SYMBOL(pm_power_off);
10
11void machine_shutdown(void)
12{
13#ifdef CONFIG_SMP
14 smp_send_stop();
15#endif
16}
17
18void machine_emergency_restart(void)
19{
20 /*
21 * Visual Workstations restart after this
22 * register is poked on the PIIX4
23 */
24 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
25}
26
27void machine_restart(char * __unused)
28{
29 machine_shutdown();
30 machine_emergency_restart();
31}
32
33void machine_power_off(void)
34{
35 unsigned short pm_status;
36 extern unsigned int pci_bus0;
37
38 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
39 outw(pm_status, PMSTS_PORT);
40
41 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
42
43 mdelay(10);
44
45#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
46 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
47
48 outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
49 outl(PIIX_SPECIAL_STOP, 0xCFC);
50}
51
52void machine_halt(void)
53{
54}
55
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
deleted file mode 100644
index de4c9dbd086f..000000000000
--- a/arch/x86/mach-visws/setup.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * Unmaintained SGI Visual Workstation support.
3 * Split out from setup.c by davej@suse.de
4 */
5
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/interrupt.h>
9#include <linux/module.h>
10
11#include <asm/fixmap.h>
12#include <asm/arch_hooks.h>
13#include <asm/io.h>
14#include <asm/e820.h>
15#include <asm/setup.h>
16#include "cobalt.h"
17#include "piix4.h"
18
19int no_broadcast;
20
21char visws_board_type = -1;
22char visws_board_rev = -1;
23
24void __init visws_get_board_type_and_rev(void)
25{
26 int raw;
27
28 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
29 >> PIIX_GPI_BD_SHIFT;
30 /*
31 * Get Board rev.
32 * First, we have to initialize the 307 part to allow us access
33 * to the GPIO registers. Let's map them at 0x0fc0 which is right
34 * after the PIIX4 PM section.
35 */
36 outb_p(SIO_DEV_SEL, SIO_INDEX);
37 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
38
39 outb_p(SIO_DEV_MSB, SIO_INDEX);
40 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
41
42 outb_p(SIO_DEV_LSB, SIO_INDEX);
43 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
44
45 outb_p(SIO_DEV_ENB, SIO_INDEX);
46 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
47
48 /*
49 * Now, we have to map the power management section to write
50 * a bit which enables access to the GPIO registers.
51 * What lunatic came up with this shit?
52 */
53 outb_p(SIO_DEV_SEL, SIO_INDEX);
54 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
55
56 outb_p(SIO_DEV_MSB, SIO_INDEX);
57 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
58
59 outb_p(SIO_DEV_LSB, SIO_INDEX);
60 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
61
62 outb_p(SIO_DEV_ENB, SIO_INDEX);
63 outb_p(1, SIO_DATA); /* Enable PM registers. */
64
65 /*
66 * Now, write the PM register which enables the GPIO registers.
67 */
68 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
69 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
70
71 /*
72 * Now, initialize the GPIO registers.
73 * We want them all to be inputs which is the
74 * power on default, so let's leave them alone.
75 * So, let's just read the board rev!
76 */
77 raw = inb_p(SIO_GP_DATA1);
78 raw &= 0x7f; /* 7 bits of valid board revision ID. */
79
80 if (visws_board_type == VISWS_320) {
81 if (raw < 0x6) {
82 visws_board_rev = 4;
83 } else if (raw < 0xc) {
84 visws_board_rev = 5;
85 } else {
86 visws_board_rev = 6;
87 }
88 } else if (visws_board_type == VISWS_540) {
89 visws_board_rev = 2;
90 } else {
91 visws_board_rev = raw;
92 }
93
94 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
95 (visws_board_type == VISWS_320 ? "320" :
96 (visws_board_type == VISWS_540 ? "540" :
97 "unknown")), visws_board_rev);
98}
99
100void __init pre_intr_init_hook(void)
101{
102 init_VISWS_APIC_irqs();
103}
104
105void __init intr_init_hook(void)
106{
107#ifdef CONFIG_X86_LOCAL_APIC
108 apic_intr_init();
109#endif
110}
111
112void __init pre_setup_arch_hook()
113{
114 visws_get_board_type_and_rev();
115}
116
117static struct irqaction irq0 = {
118 .handler = timer_interrupt,
119 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
120 .name = "timer",
121};
122
123void __init time_init_hook(void)
124{
125 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
126
127 /* Set the countdown value */
128 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
129
130 /* Start the timer */
131 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
132
133 /* Enable (unmask) the timer interrupt */
134 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
135
136 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
137 setup_irq(0, &irq0);
138}
139
140/* Hook for machine specific memory setup. */
141
142#define MB (1024 * 1024)
143
144unsigned long sgivwfb_mem_phys;
145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
148
149long long mem_size __initdata = 0;
150
151char * __init machine_specific_memory_setup(void)
152{
153 long long gfx_mem_size = 8 * MB;
154
155 mem_size = boot_params.alt_mem_k;
156
157 if (!mem_size) {
158 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
159 mem_size = 128 * MB;
160 }
161
162 /*
163 * this hardcodes the graphics memory to 8 MB
164 * it really should be sized dynamically (or at least
165 * set as a boot param)
166 */
167 if (!sgivwfb_mem_size) {
168 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
169 sgivwfb_mem_size = 8 * MB;
170 }
171
172 /*
173 * Trim to nearest MB
174 */
175 sgivwfb_mem_size &= ~((1 << 20) - 1);
176 sgivwfb_mem_phys = mem_size - gfx_mem_size;
177
178 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
179 add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
180 add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
181
182 return "PROM";
183}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
deleted file mode 100644
index bfac6ba10f8a..000000000000
--- a/arch/x86/mach-visws/traps.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* VISWS traps */
2
3#include <linux/sched.h>
4#include <linux/kernel.h>
5#include <linux/init.h>
6#include <linux/pci.h>
7#include <linux/pci_ids.h>
8
9#include <asm/io.h>
10#include <asm/arch_hooks.h>
11#include <asm/apic.h>
12#include "cobalt.h"
13#include "lithium.h"
14
15
16#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
17#define BCD (LI_INTB | LI_INTC | LI_INTD)
18#define ALLDEVS (A01234 | BCD)
19
20static __init void lithium_init(void)
21{
22 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
23 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
24
25 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
26 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
27 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
28 panic("This machine is not SGI Visual Workstation 320/540");
29 }
30
31 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
32 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
33 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
34 panic("This machine is not SGI Visual Workstation 320/540");
35 }
36
37 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
38 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
39}
40
41static __init void cobalt_init(void)
42{
43 /*
44 * On normal SMP PC this is used only with SMP, but we have to
45 * use it and set it up here to start the Cobalt clock
46 */
47 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
48 setup_local_APIC();
49 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
50 (unsigned int)apic_read(APIC_LVR),
51 (unsigned int)apic_read(APIC_ID));
52
53 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
54 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
55 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
56 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
57
58 /* Enable Cobalt APIC being careful to NOT change the ID! */
59 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
60
61 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
62 co_apic_read(CO_APIC_ID));
63}
64
65void __init trap_init_hook(void)
66{
67 lithium_init();
68 cobalt_init();
69}
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
deleted file mode 100644
index cef9cb1d15ac..000000000000
--- a/arch/x86/mach-visws/visws_apic.c
+++ /dev/null
@@ -1,297 +0,0 @@
1/*
2 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
3 *
4 * SGI Visual Workstation interrupt controller
5 *
6 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
7 * which serves as the main interrupt controller in the system. Non-legacy
8 * hardware in the system uses this controller directly. Legacy devices
9 * are connected to the PIIX4 which in turn has its 8259(s) connected to
10 * a of the Cobalt APIC entry.
11 *
12 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
13 *
14 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
15 */
16
17#include <linux/kernel_stat.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20
21#include <asm/io.h>
22#include <asm/apic.h>
23#include <asm/i8259.h>
24
25#include "cobalt.h"
26#include "irq_vectors.h"
27
28
29static DEFINE_SPINLOCK(cobalt_lock);
30
31/*
32 * Set the given Cobalt APIC Redirection Table entry to point
33 * to the given IDT vector/index.
34 */
35static inline void co_apic_set(int entry, int irq)
36{
37 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
38 co_apic_write(CO_APIC_HI(entry), 0);
39}
40
41/*
42 * Cobalt (IO)-APIC functions to handle PCI devices.
43 */
44static inline int co_apic_ide0_hack(void)
45{
46 extern char visws_board_type;
47 extern char visws_board_rev;
48
49 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
50 return 5;
51 return CO_APIC_IDE0;
52}
53
54static int is_co_apic(unsigned int irq)
55{
56 if (IS_CO_APIC(irq))
57 return CO_APIC(irq);
58
59 switch (irq) {
60 case 0: return CO_APIC_CPU;
61 case CO_IRQ_IDE0: return co_apic_ide0_hack();
62 case CO_IRQ_IDE1: return CO_APIC_IDE1;
63 default: return -1;
64 }
65}
66
67
68/*
69 * This is the SGI Cobalt (IO-)APIC:
70 */
71
72static void enable_cobalt_irq(unsigned int irq)
73{
74 co_apic_set(is_co_apic(irq), irq);
75}
76
77static void disable_cobalt_irq(unsigned int irq)
78{
79 int entry = is_co_apic(irq);
80
81 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
82 co_apic_read(CO_APIC_LO(entry));
83}
84
85/*
86 * "irq" really just serves to identify the device. Here is where we
87 * map this to the Cobalt APIC entry where it's physically wired.
88 * This is called via request_irq -> setup_irq -> irq_desc->startup()
89 */
90static unsigned int startup_cobalt_irq(unsigned int irq)
91{
92 unsigned long flags;
93
94 spin_lock_irqsave(&cobalt_lock, flags);
95 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
96 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
97 enable_cobalt_irq(irq);
98 spin_unlock_irqrestore(&cobalt_lock, flags);
99 return 0;
100}
101
102static void ack_cobalt_irq(unsigned int irq)
103{
104 unsigned long flags;
105
106 spin_lock_irqsave(&cobalt_lock, flags);
107 disable_cobalt_irq(irq);
108 apic_write(APIC_EOI, APIC_EIO_ACK);
109 spin_unlock_irqrestore(&cobalt_lock, flags);
110}
111
112static void end_cobalt_irq(unsigned int irq)
113{
114 unsigned long flags;
115
116 spin_lock_irqsave(&cobalt_lock, flags);
117 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
118 enable_cobalt_irq(irq);
119 spin_unlock_irqrestore(&cobalt_lock, flags);
120}
121
122static struct irq_chip cobalt_irq_type = {
123 .typename = "Cobalt-APIC",
124 .startup = startup_cobalt_irq,
125 .shutdown = disable_cobalt_irq,
126 .enable = enable_cobalt_irq,
127 .disable = disable_cobalt_irq,
128 .ack = ack_cobalt_irq,
129 .end = end_cobalt_irq,
130};
131
132
133/*
134 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
135 * -- not the manner expected by the code in i8259.c.
136 *
137 * there is a 'master' physical interrupt source that gets sent to
138 * the CPU. But in the chipset there are various 'virtual' interrupts
139 * waiting to be handled. We represent this to Linux through a 'master'
140 * interrupt controller type, and through a special virtual interrupt-
141 * controller. Device drivers only see the virtual interrupt sources.
142 */
143static unsigned int startup_piix4_master_irq(unsigned int irq)
144{
145 init_8259A(0);
146
147 return startup_cobalt_irq(irq);
148}
149
150static void end_piix4_master_irq(unsigned int irq)
151{
152 unsigned long flags;
153
154 spin_lock_irqsave(&cobalt_lock, flags);
155 enable_cobalt_irq(irq);
156 spin_unlock_irqrestore(&cobalt_lock, flags);
157}
158
159static struct irq_chip piix4_master_irq_type = {
160 .typename = "PIIX4-master",
161 .startup = startup_piix4_master_irq,
162 .ack = ack_cobalt_irq,
163 .end = end_piix4_master_irq,
164};
165
166
167static struct irq_chip piix4_virtual_irq_type = {
168 .typename = "PIIX4-virtual",
169 .shutdown = disable_8259A_irq,
170 .enable = enable_8259A_irq,
171 .disable = disable_8259A_irq,
172};
173
174
175/*
176 * PIIX4-8259 master/virtual functions to handle interrupt requests
177 * from legacy devices: floppy, parallel, serial, rtc.
178 *
179 * None of these get Cobalt APIC entries, neither do they have IDT
180 * entries. These interrupts are purely virtual and distributed from
181 * the 'master' interrupt source: CO_IRQ_8259.
182 *
183 * When the 8259 interrupts its handler figures out which of these
184 * devices is interrupting and dispatches to its handler.
185 *
186 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
187 * enable_irq gets the right irq. This 'master' irq is never directly
188 * manipulated by any driver.
189 */
190static irqreturn_t piix4_master_intr(int irq, void *dev_id)
191{
192 int realirq;
193 irq_desc_t *desc;
194 unsigned long flags;
195
196 spin_lock_irqsave(&i8259A_lock, flags);
197
198 /* Find out what's interrupting in the PIIX4 master 8259 */
199 outb(0x0c, 0x20); /* OCW3 Poll command */
200 realirq = inb(0x20);
201
202 /*
203 * Bit 7 == 0 means invalid/spurious
204 */
205 if (unlikely(!(realirq & 0x80)))
206 goto out_unlock;
207
208 realirq &= 7;
209
210 if (unlikely(realirq == 2)) {
211 outb(0x0c, 0xa0);
212 realirq = inb(0xa0);
213
214 if (unlikely(!(realirq & 0x80)))
215 goto out_unlock;
216
217 realirq = (realirq & 7) + 8;
218 }
219
220 /* mask and ack interrupt */
221 cached_irq_mask |= 1 << realirq;
222 if (unlikely(realirq > 7)) {
223 inb(0xa1);
224 outb(cached_slave_mask, 0xa1);
225 outb(0x60 + (realirq & 7), 0xa0);
226 outb(0x60 + 2, 0x20);
227 } else {
228 inb(0x21);
229 outb(cached_master_mask, 0x21);
230 outb(0x60 + realirq, 0x20);
231 }
232
233 spin_unlock_irqrestore(&i8259A_lock, flags);
234
235 desc = irq_desc + realirq;
236
237 /*
238 * handle this 'virtual interrupt' as a Cobalt one now.
239 */
240 kstat_cpu(smp_processor_id()).irqs[realirq]++;
241
242 if (likely(desc->action != NULL))
243 handle_IRQ_event(realirq, desc->action);
244
245 if (!(desc->status & IRQ_DISABLED))
246 enable_8259A_irq(realirq);
247
248 return IRQ_HANDLED;
249
250out_unlock:
251 spin_unlock_irqrestore(&i8259A_lock, flags);
252 return IRQ_NONE;
253}
254
255static struct irqaction master_action = {
256 .handler = piix4_master_intr,
257 .name = "PIIX4-8259",
258};
259
260static struct irqaction cascade_action = {
261 .handler = no_action,
262 .name = "cascade",
263};
264
265
266void init_VISWS_APIC_irqs(void)
267{
268 int i;
269
270 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
271 irq_desc[i].status = IRQ_DISABLED;
272 irq_desc[i].action = 0;
273 irq_desc[i].depth = 1;
274
275 if (i == 0) {
276 irq_desc[i].chip = &cobalt_irq_type;
277 }
278 else if (i == CO_IRQ_IDE0) {
279 irq_desc[i].chip = &cobalt_irq_type;
280 }
281 else if (i == CO_IRQ_IDE1) {
282 irq_desc[i].chip = &cobalt_irq_type;
283 }
284 else if (i == CO_IRQ_8259) {
285 irq_desc[i].chip = &piix4_master_irq_type;
286 }
287 else if (i < CO_IRQ_APIC0) {
288 irq_desc[i].chip = &piix4_virtual_irq_type;
289 }
290 else if (IS_CO_APIC(i)) {
291 irq_desc[i].chip = &cobalt_irq_type;
292 }
293 }
294
295 setup_irq(CO_IRQ_8259, &master_action);
296 setup_irq(2, &cascade_action);
297}
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 5ae5466b9eb9..6bbdd633864c 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -62,6 +62,7 @@ void __init time_init_hook(void)
62char *__init machine_specific_memory_setup(void) 62char *__init machine_specific_memory_setup(void)
63{ 63{
64 char *who; 64 char *who;
65 int new_nr;
65 66
66 who = "NOT VOYAGER"; 67 who = "NOT VOYAGER";
67 68
@@ -73,7 +74,7 @@ char *__init machine_specific_memory_setup(void)
73 74
74 e820.nr_map = 0; 75 e820.nr_map = 0;
75 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { 76 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
76 add_memory_region(addr, length, E820_RAM); 77 e820_add_region(addr, length, E820_RAM);
77 } 78 }
78 return who; 79 return who;
79 } else if (voyager_level == 4) { 80 } else if (voyager_level == 4) {
@@ -91,43 +92,17 @@ char *__init machine_specific_memory_setup(void)
91 tom = (boot_params.screen_info.ext_mem_k) << 10; 92 tom = (boot_params.screen_info.ext_mem_k) << 10;
92 } 93 }
93 who = "Voyager-TOM"; 94 who = "Voyager-TOM";
94 add_memory_region(0, 0x9f000, E820_RAM); 95 e820_add_region(0, 0x9f000, E820_RAM);
95 /* map from 1M to top of memory */ 96 /* map from 1M to top of memory */
96 add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, 97 e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
97 E820_RAM); 98 E820_RAM);
98 /* FIXME: Should check the ASICs to see if I need to 99 /* FIXME: Should check the ASICs to see if I need to
99 * take out the 8M window. Just do it at the moment 100 * take out the 8M window. Just do it at the moment
100 * */ 101 * */
101 add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024, 102 e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024,
102 E820_RESERVED); 103 E820_RESERVED);
103 return who; 104 return who;
104 } 105 }
105 106
106 who = "BIOS-e820"; 107 return default_machine_specific_memory_setup();
107
108 /*
109 * Try to copy the BIOS-supplied E820-map.
110 *
111 * Otherwise fake a memory map; one section from 0k->640k,
112 * the next section from 1mb->appropriate_mem_k
113 */
114 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
115 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
116 < 0) {
117 unsigned long mem_size;
118
119 /* compare results from other methods and take the greater */
120 if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
121 mem_size = boot_params.screen_info.ext_mem_k;
122 who = "BIOS-88";
123 } else {
124 mem_size = boot_params.alt_mem_k;
125 who = "BIOS-e801";
126 }
127
128 e820.nr_map = 0;
129 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
130 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
131 }
132 return who;
133} 108}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 8acbf0cdf1a5..199a5f4a873c 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0;
59 * activity count. Finally exported by i386_ksyms.c */ 59 * activity count. Finally exported by i386_ksyms.c */
60static int voyager_extended_cpus = 1; 60static int voyager_extended_cpus = 1;
61 61
62/* Have we found an SMP box - used by time.c to do the profiling
63 interrupt for timeslicing; do not set to 1 until the per CPU timer
64 interrupt is active */
65int smp_found_config = 0;
66
67/* Used for the invalidate map that's also checked in the spinlock */ 62/* Used for the invalidate map that's also checked in the spinlock */
68static volatile unsigned long smp_invalidate_needed; 63static volatile unsigned long smp_invalidate_needed;
69 64
@@ -453,6 +448,8 @@ static void __init start_secondary(void *unused)
453 448
454 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); 449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
455 450
451 notify_cpu_starting(cpuid);
452
456 /* enable interrupts */ 453 /* enable interrupts */
457 local_irq_enable(); 454 local_irq_enable();
458 455
@@ -955,94 +952,24 @@ static void smp_stop_cpu_function(void *dummy)
955 halt(); 952 halt();
956} 953}
957 954
958static DEFINE_SPINLOCK(call_lock);
959
960struct call_data_struct {
961 void (*func) (void *info);
962 void *info;
963 volatile unsigned long started;
964 volatile unsigned long finished;
965 int wait;
966};
967
968static struct call_data_struct *call_data;
969
970/* execute a thread on a new CPU. The function to be called must be 955/* execute a thread on a new CPU. The function to be called must be
971 * previously set up. This is used to schedule a function for 956 * previously set up. This is used to schedule a function for
972 * execution on all CPUs - set up the function then broadcast a 957 * execution on all CPUs - set up the function then broadcast a
973 * function_interrupt CPI to come here on each CPU */ 958 * function_interrupt CPI to come here on each CPU */
974static void smp_call_function_interrupt(void) 959static void smp_call_function_interrupt(void)
975{ 960{
976 void (*func) (void *info) = call_data->func;
977 void *info = call_data->info;
978 /* must take copy of wait because call_data may be replaced
979 * unless the function is waiting for us to finish */
980 int wait = call_data->wait;
981 __u8 cpu = smp_processor_id();
982
983 /*
984 * Notify initiating CPU that I've grabbed the data and am
985 * about to execute the function
986 */
987 mb();
988 if (!test_and_clear_bit(cpu, &call_data->started)) {
989 /* If the bit wasn't set, this could be a replay */
 990 printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function"
991 " with no call pending\n", cpu);
992 return;
993 }
994 /*
995 * At this point the info structure may be out of scope unless wait==1
996 */
997 irq_enter(); 961 irq_enter();
998 (*func) (info); 962 generic_smp_call_function_interrupt();
999 __get_cpu_var(irq_stat).irq_call_count++; 963 __get_cpu_var(irq_stat).irq_call_count++;
1000 irq_exit(); 964 irq_exit();
1001 if (wait) {
1002 mb();
1003 clear_bit(cpu, &call_data->finished);
1004 }
1005} 965}
1006 966
1007static int 967static void smp_call_function_single_interrupt(void)
1008voyager_smp_call_function_mask(cpumask_t cpumask,
1009 void (*func) (void *info), void *info, int wait)
1010{ 968{
1011 struct call_data_struct data; 969 irq_enter();
1012 u32 mask = cpus_addr(cpumask)[0]; 970 generic_smp_call_function_single_interrupt();
1013 971 __get_cpu_var(irq_stat).irq_call_count++;
1014 mask &= ~(1 << smp_processor_id()); 972 irq_exit();
1015
1016 if (!mask)
1017 return 0;
1018
1019 /* Can deadlock when called with interrupts disabled */
1020 WARN_ON(irqs_disabled());
1021
1022 data.func = func;
1023 data.info = info;
1024 data.started = mask;
1025 data.wait = wait;
1026 if (wait)
1027 data.finished = mask;
1028
1029 spin_lock(&call_lock);
1030 call_data = &data;
1031 wmb();
1032 /* Send a message to all other CPUs and wait for them to respond */
1033 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1034
1035 /* Wait for response */
1036 while (data.started)
1037 barrier();
1038
1039 if (wait)
1040 while (data.finished)
1041 barrier();
1042
1043 spin_unlock(&call_lock);
1044
1045 return 0;
1046} 973}
1047 974
1048/* Sorry about the name. In an APIC based system, the APICs 975/* Sorry about the name. In an APIC based system, the APICs
@@ -1099,6 +1026,12 @@ void smp_qic_call_function_interrupt(struct pt_regs *regs)
1099 smp_call_function_interrupt(); 1026 smp_call_function_interrupt();
1100} 1027}
1101 1028
1029void smp_qic_call_function_single_interrupt(struct pt_regs *regs)
1030{
1031 ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI);
1032 smp_call_function_single_interrupt();
1033}
1034
1102void smp_vic_cpi_interrupt(struct pt_regs *regs) 1035void smp_vic_cpi_interrupt(struct pt_regs *regs)
1103{ 1036{
1104 struct pt_regs *old_regs = set_irq_regs(regs); 1037 struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1119,6 +1052,8 @@ void smp_vic_cpi_interrupt(struct pt_regs *regs)
1119 smp_enable_irq_interrupt(); 1052 smp_enable_irq_interrupt();
1120 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) 1053 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1121 smp_call_function_interrupt(); 1054 smp_call_function_interrupt();
1055 if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu]))
1056 smp_call_function_single_interrupt();
1122 set_irq_regs(old_regs); 1057 set_irq_regs(old_regs);
1123} 1058}
1124 1059
@@ -1134,16 +1069,7 @@ static void do_flush_tlb_all(void *info)
1134/* flush the TLB of every active CPU in the system */ 1069/* flush the TLB of every active CPU in the system */
1135void flush_tlb_all(void) 1070void flush_tlb_all(void)
1136{ 1071{
1137 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1072 on_each_cpu(do_flush_tlb_all, 0, 1);
1138}
1139
1140/* used to set up the trampoline for other CPUs when the memory manager
1141 * is sorted out */
1142void __init smp_alloc_memory(void)
1143{
1144 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
1145 if (__pa(trampoline_base) >= 0x93000)
1146 BUG();
1147} 1073}
1148 1074
1149/* send a reschedule CPI to one CPU by physical CPU number*/ 1075/* send a reschedule CPI to one CPU by physical CPU number*/
@@ -1175,7 +1101,7 @@ int safe_smp_processor_id(void)
1175/* broadcast a halt to all other CPUs */ 1101/* broadcast a halt to all other CPUs */
1176static void voyager_smp_send_stop(void) 1102static void voyager_smp_send_stop(void)
1177{ 1103{
1178 smp_call_function(smp_stop_cpu_function, NULL, 1, 1); 1104 smp_call_function(smp_stop_cpu_function, NULL, 1);
1179} 1105}
1180 1106
1181/* this function is triggered in time.c when a clock tick fires 1107/* this function is triggered in time.c when a clock tick fires
@@ -1862,5 +1788,7 @@ struct smp_ops smp_ops = {
1862 1788
1863 .smp_send_stop = voyager_smp_send_stop, 1789 .smp_send_stop = voyager_smp_send_stop,
1864 .smp_send_reschedule = voyager_smp_send_reschedule, 1790 .smp_send_reschedule = voyager_smp_send_reschedule,
1865 .smp_call_function_mask = voyager_smp_call_function_mask, 1791
1792 .send_call_func_ipi = native_send_call_func_ipi,
1793 .send_call_func_single_ipi = native_send_call_func_single_ipi,
1866}; 1794};
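
The hunks above drop Voyager's private call_data machinery and the old four-argument helpers in favour of the generic cross-CPU call infrastructure. A minimal caller sketch of the post-conversion API follows; the function names drain_local_state() and drain_all_cpus() are hypothetical, only smp_call_function() and on_each_cpu() come from this patch:

#include <linux/smp.h>

/* hypothetical per-CPU worker, run via the generic call-function IPI */
static void drain_local_state(void *info)
{
	/* per-CPU work goes here */
}

static void drain_all_cpus(void)
{
	/* new 3-argument form: func, info, wait-for-completion (the old retry argument is gone) */
	smp_call_function(drain_local_state, NULL, 1);

	/* same, but also runs the function on the calling CPU */
	on_each_cpu(drain_local_state, NULL, 1);
}
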
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 04869e64b18e..00548354912f 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -16,8 +16,8 @@
16#include "reg_constant.h" 16#include "reg_constant.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19#define MAKE_REG(s,e,l,h) { l, h, \ 19#define MAKE_REG(s, e, l, h) { l, h, \
20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } 20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
21 21
22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); 22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
23#if 0 23#if 0
@@ -40,7 +40,7 @@ FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); 40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
41 41
42/* Only the sign and significand (and tag) are used in internal NaNs */ 42/* Only the sign and significand (and tag) are used in internal NaNs */
43/* The 80486 never generates one of these 43/* The 80486 never generates one of these
44FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); 44FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
45 */ 45 */
46/* This is the real indefinite QNaN */ 46/* This is the real indefinite QNaN */
@@ -49,7 +49,7 @@ FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49/* Only the sign (and tag) is used in internal infinities */ 49/* Only the sign (and tag) is used in internal infinities */
50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); 50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
51 51
52static void fld_const(FPU_REG const *c, int adj, u_char tag) 52static void fld_const(FPU_REG const * c, int adj, u_char tag)
53{ 53{
54 FPU_REG *st_new_ptr; 54 FPU_REG *st_new_ptr;
55 55
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..59f89b434b45 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
@@ -8,10 +8,13 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11ifeq ($(CONFIG_X86_32),y) 11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_NUMA) += discontig_32.o 12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13else 13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_NUMA) += numa_64.o 14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15
16obj-$(CONFIG_NUMA) += numa_$(BITS).o
15obj-$(CONFIG_K8_NUMA) += k8topology_64.o 17obj-$(CONFIG_K8_NUMA) += k8topology_64.o
16obj-$(CONFIG_ACPI_NUMA) += srat_64.o 18obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
17endif 19
20obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 42 { 0, "User Space" },
43#ifdef CONFIG_X86_64 43#ifdef CONFIG_X86_64
44 { 0x8000000000000000UL, "Kernel Space" }, 44 { 0x8000000000000000UL, "Kernel Space" },
45 { 0xffff810000000000UL, "Low Kernel Mapping" }, 45 { PAGE_OFFSET, "Low Kernel Mapping" },
46 { VMALLOC_START, "vmalloc() Area" }, 46 { VMALLOC_START, "vmalloc() Area" },
47 { VMEMMAP_START, "Vmemmap" }, 47 { VMEMMAP_START, "Vmemmap" },
48 { __START_KERNEL_map, "High Kernel Mapping" }, 48 { __START_KERNEL_map, "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0c5dcee23bb1..d18ea136d8a6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -35,6 +36,7 @@
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
36#include <asm/proto.h> 37#include <asm/proto.h>
37#include <asm-generic/sections.h> 38#include <asm-generic/sections.h>
39#include <asm/traps.h>
38 40
39/* 41/*
40 * Page fault error code bits 42 * Page fault error code bits
@@ -50,17 +52,23 @@
50#define PF_RSVD (1<<3) 52#define PF_RSVD (1<<3)
51#define PF_INSTR (1<<4) 53#define PF_INSTR (1<<4)
52 54
55static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
56{
57#ifdef CONFIG_MMIOTRACE_HOOKS
58 if (unlikely(is_kmmio_active()))
59 if (kmmio_handler(regs, addr) == 1)
60 return -1;
61#endif
62 return 0;
63}
64
53static inline int notify_page_fault(struct pt_regs *regs) 65static inline int notify_page_fault(struct pt_regs *regs)
54{ 66{
55#ifdef CONFIG_KPROBES 67#ifdef CONFIG_KPROBES
56 int ret = 0; 68 int ret = 0;
57 69
58 /* kprobe_running() needs smp_processor_id() */ 70 /* kprobe_running() needs smp_processor_id() */
59#ifdef CONFIG_X86_32
60 if (!user_mode_vm(regs)) { 71 if (!user_mode_vm(regs)) {
61#else
62 if (!user_mode(regs)) {
63#endif
64 preempt_disable(); 72 preempt_disable();
65 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 73 if (kprobe_running() && kprobe_fault_handler(regs, 14))
66 ret = 1; 74 ret = 1;
@@ -351,8 +359,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
351 return 0; 359 return 0;
352} 360}
353 361
354void do_invalid_op(struct pt_regs *, unsigned long);
355
356static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 362static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
357{ 363{
358#ifdef CONFIG_X86_F00F_BUG 364#ifdef CONFIG_X86_F00F_BUG
@@ -397,11 +403,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
397 printk(KERN_CONT "NULL pointer dereference"); 403 printk(KERN_CONT "NULL pointer dereference");
398 else 404 else
399 printk(KERN_CONT "paging request"); 405 printk(KERN_CONT "paging request");
400#ifdef CONFIG_X86_32 406 printk(KERN_CONT " at %p\n", (void *) address);
401 printk(KERN_CONT " at %08lx\n", address);
402#else
403 printk(KERN_CONT " at %016lx\n", address);
404#endif
405 printk(KERN_ALERT "IP:"); 407 printk(KERN_ALERT "IP:");
406 printk_address(regs->ip, 1); 408 printk_address(regs->ip, 1);
407 dump_pagetable(address); 409 dump_pagetable(address);
@@ -593,11 +595,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
593 unsigned long flags; 595 unsigned long flags;
594#endif 596#endif
595 597
596 /*
597 * We can fault from pretty much anywhere, with unknown IRQ state.
598 */
599 trace_hardirqs_fixup();
600
601 tsk = current; 598 tsk = current;
602 mm = tsk->mm; 599 mm = tsk->mm;
603 prefetchw(&mm->mmap_sem); 600 prefetchw(&mm->mmap_sem);
@@ -609,6 +606,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
609 606
610 if (notify_page_fault(regs)) 607 if (notify_page_fault(regs))
611 return; 608 return;
609 if (unlikely(kmmio_fault(regs, address)))
610 return;
612 611
613 /* 612 /*
614 * We fault-in kernel-space virtual memory on-demand. The 613 * We fault-in kernel-space virtual memory on-demand. The
@@ -803,14 +802,10 @@ bad_area_nosemaphore:
803 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 802 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
804 printk_ratelimit()) { 803 printk_ratelimit()) {
805 printk( 804 printk(
806#ifdef CONFIG_X86_32 805 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
807 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
808#else
809 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
810#endif
811 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 806 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
812 tsk->comm, task_pid_nr(tsk), address, regs->ip, 807 tsk->comm, task_pid_nr(tsk), address,
813 regs->sp, error_code); 808 (void *) regs->ip, (void *) regs->sp, error_code);
814 print_vma_addr(" in ", regs->ip); 809 print_vma_addr(" in ", regs->ip);
815 printk("\n"); 810 printk("\n");
816 } 811 }
@@ -921,72 +916,45 @@ LIST_HEAD(pgd_list);
921 916
922void vmalloc_sync_all(void) 917void vmalloc_sync_all(void)
923{ 918{
924#ifdef CONFIG_X86_32
925 /*
926 * Note that races in the updates of insync and start aren't
927 * problematic: insync can only get set bits added, and updates to
928 * start are only improving performance (without affecting correctness
929 * if undone).
930 */
931 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
932 static unsigned long start = TASK_SIZE;
933 unsigned long address; 919 unsigned long address;
934 920
921#ifdef CONFIG_X86_32
935 if (SHARED_KERNEL_PMD) 922 if (SHARED_KERNEL_PMD)
936 return; 923 return;
937 924
938 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 925 for (address = VMALLOC_START & PMD_MASK;
939 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 926 address >= TASK_SIZE && address < FIXADDR_TOP;
940 if (!test_bit(pgd_index(address), insync)) { 927 address += PMD_SIZE) {
941 unsigned long flags; 928 unsigned long flags;
942 struct page *page; 929 struct page *page;
943 930
944 spin_lock_irqsave(&pgd_lock, flags); 931 spin_lock_irqsave(&pgd_lock, flags);
945 list_for_each_entry(page, &pgd_list, lru) { 932 list_for_each_entry(page, &pgd_list, lru) {
946 if (!vmalloc_sync_one(page_address(page), 933 if (!vmalloc_sync_one(page_address(page),
947 address)) 934 address))
948 break; 935 break;
949 }
950 spin_unlock_irqrestore(&pgd_lock, flags);
951 if (!page)
952 set_bit(pgd_index(address), insync);
953 } 936 }
954 if (address == start && test_bit(pgd_index(address), insync)) 937 spin_unlock_irqrestore(&pgd_lock, flags);
955 start = address + PGDIR_SIZE;
956 } 938 }
957#else /* CONFIG_X86_64 */ 939#else /* CONFIG_X86_64 */
958 /* 940 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
959 * Note that races in the updates of insync and start aren't 941 address += PGDIR_SIZE) {
960 * problematic: insync can only get set bits added, and updates to 942 const pgd_t *pgd_ref = pgd_offset_k(address);
961 * start are only improving performance (without affecting correctness 943 unsigned long flags;
962 * if undone). 944 struct page *page;
963 */ 945
964 static DECLARE_BITMAP(insync, PTRS_PER_PGD); 946 if (pgd_none(*pgd_ref))
965 static unsigned long start = VMALLOC_START & PGDIR_MASK; 947 continue;
966 unsigned long address; 948 spin_lock_irqsave(&pgd_lock, flags);
967 949 list_for_each_entry(page, &pgd_list, lru) {
968 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { 950 pgd_t *pgd;
969 if (!test_bit(pgd_index(address), insync)) { 951 pgd = (pgd_t *)page_address(page) + pgd_index(address);
970 const pgd_t *pgd_ref = pgd_offset_k(address); 952 if (pgd_none(*pgd))
971 unsigned long flags; 953 set_pgd(pgd, *pgd_ref);
972 struct page *page; 954 else
973 955 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
974 if (pgd_none(*pgd_ref))
975 continue;
976 spin_lock_irqsave(&pgd_lock, flags);
977 list_for_each_entry(page, &pgd_list, lru) {
978 pgd_t *pgd;
979 pgd = (pgd_t *)page_address(page) + pgd_index(address);
980 if (pgd_none(*pgd))
981 set_pgd(pgd, *pgd_ref);
982 else
983 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
984 }
985 spin_unlock_irqrestore(&pgd_lock, flags);
986 set_bit(pgd_index(address), insync);
987 } 956 }
988 if (address == start) 957 spin_unlock_irqrestore(&pgd_lock, flags);
989 start = address + PGDIR_SIZE;
990 } 958 }
991#endif 959#endif
992} 960}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..4ba373c5b8c8
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
 21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_flags(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_flags(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr', to avoid large irq off latency
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (eg. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main guy that batches up lots of get_user_pages, and even
247 * they are limited to 64-at-a-time which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
 255 * address down to the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
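
As the slow-path return handling above shows, get_user_pages_fast() reports the number of pages it managed to pin (falling back to get_user_pages() under mmap_sem for the remainder) and returns a negative error only when nothing was pinned. A hedged usage sketch follows; pin_user_buffer() is a hypothetical caller, not part of this file:

#include <linux/mm.h>
#include <linux/errno.h>

/* hypothetical helper: pin 'len' bytes of a user buffer, e.g. for direct I/O */
static int pin_user_buffer(unsigned long uaddr, size_t len, int write,
			   struct page **pages)
{
	int nr_pages = ((uaddr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int pinned, i;

	pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (pinned == nr_pages)
		return nr_pages;

	/* partial or failed pin: release whatever was pinned and bail out */
	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned < 0 ? pinned : -EFAULT;
}
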
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
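
For scale, the shifts fed to hugetlb_add_hstate() above work out as follows on x86-64 (constant values assumed here, not restated in the hunk): PAGE_SHIFT = 12, PMD_SHIFT = 21 and PUD_SHIFT = 30, so booting with "hugepagesz=2M" registers an hstate of order 21 - 12 = 9 (2 MB pages), while "hugepagesz=1G" on a CPU with gbpages registers order 30 - 12 = 18 (1 GB pages); any other size trips the "Unsupported page size" warning.
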
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..8396868e82c5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32 32
33#include <asm/asm.h> 33#include <asm/asm.h>
34#include <asm/bios_ebda.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -47,9 +48,11 @@
47#include <asm/paravirt.h> 48#include <asm/paravirt.h>
48#include <asm/setup.h> 49#include <asm/setup.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/smp.h>
50 52
51unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
52 54
55unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 56unsigned long max_pfn_mapped;
54 57
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 58DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +60,27 @@ unsigned long highstart_pfn, highend_pfn;
57 60
58static noinline int do_test_wp_bit(void); 61static noinline int do_test_wp_bit(void);
59 62
63
64static unsigned long __initdata table_start;
65static unsigned long __meminitdata table_end;
66static unsigned long __meminitdata table_top;
67
68static int __initdata after_init_bootmem;
69
70static __init void *alloc_low_page(unsigned long *phys)
71{
72 unsigned long pfn = table_end++;
73 void *adr;
74
75 if (pfn >= table_top)
76 panic("alloc_low_page: ran out of memory");
77
78 adr = __va(pfn * PAGE_SIZE);
79 memset(adr, 0, PAGE_SIZE);
80 *phys = pfn * PAGE_SIZE;
81 return adr;
82}
83
60/* 84/*
61 * Creates a middle page table and puts a pointer to it in the 85 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 86 * given global directory entry. This only returns the gd entry
@@ -68,9 +92,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 92 pmd_t *pmd_table;
69 93
70#ifdef CONFIG_X86_PAE 94#ifdef CONFIG_X86_PAE
95 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 96 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 97 if (after_init_bootmem)
73 98 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
99 else
100 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 101 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 102 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 103 pud = pud_offset(pgd, 0);
@@ -92,12 +119,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 119 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 120 pte_t *page_table = NULL;
94 121
122 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 123#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 124 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 125#endif
98 if (!page_table) { 126 if (!page_table)
99 page_table = 127 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 128 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
129 } else {
130 unsigned long phys;
131 page_table = (pte_t *)alloc_low_page(&phys);
101 } 132 }
102 133
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 134 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,40 +186,72 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 186 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 187 * PAGE_OFFSET:
157 */ 188 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 189static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
190 unsigned long start_pfn,
191 unsigned long end_pfn,
192 int use_pse)
159{ 193{
160 int pgd_idx, pmd_idx, pte_ofs; 194 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 195 unsigned long pfn;
162 pgd_t *pgd; 196 pgd_t *pgd;
163 pmd_t *pmd; 197 pmd_t *pmd;
164 pte_t *pte; 198 pte_t *pte;
199 unsigned pages_2m, pages_4k;
200 int mapping_iter;
165 201
166 pgd_idx = pgd_index(PAGE_OFFSET); 202 /*
167 pgd = pgd_base + pgd_idx; 203 * First iteration will setup identity mapping using large/small pages
168 pfn = 0; 204 * based on use_pse, with other attributes same as set by
205 * the early code in head_32.S
206 *
207 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
208 * as desired for the kernel identity mapping.
209 *
210 * This two pass mechanism conforms to the TLB app note which says:
211 *
212 * "Software should not write to a paging-structure entry in a way
213 * that would change, for any linear address, both the page size
214 * and either the page frame or attributes."
215 */
216 mapping_iter = 1;
169 217
218 if (!cpu_has_pse)
219 use_pse = 0;
220
221repeat:
222 pages_2m = pages_4k = 0;
223 pfn = start_pfn;
224 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
225 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 226 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 227 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 228
175 for (pmd_idx = 0; 229 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 230 continue;
231#ifdef CONFIG_X86_PAE
232 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
233 pmd += pmd_idx;
234#else
235 pmd_idx = 0;
236#endif
237 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 238 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 239 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 240
180 /* 241 /*
181 * Map with big pages if possible, otherwise 242 * Map with big pages if possible, otherwise
182 * create normal page tables: 243 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 244 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 245 if (use_pse) {
190 unsigned int addr2; 246 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 247 pgprot_t prot = PAGE_KERNEL_LARGE;
248 /*
249 * first pass will use the same initial
250 * identity mapping attribute + _PAGE_PSE.
251 */
252 pgprot_t init_prot =
253 __pgprot(PTE_IDENT_ATTR |
254 _PAGE_PSE);
192 255
193 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 256 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
194 PAGE_OFFSET + PAGE_SIZE-1; 257 PAGE_OFFSET + PAGE_SIZE-1;
@@ -197,34 +260,59 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 260 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 261 prot = PAGE_KERNEL_LARGE_EXEC;
199 262
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 263 pages_2m++;
264 if (mapping_iter == 1)
265 set_pmd(pmd, pfn_pmd(pfn, init_prot));
266 else
267 set_pmd(pmd, pfn_pmd(pfn, prot));
201 268
202 pfn += PTRS_PER_PTE; 269 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 270 continue;
205 } 271 }
206 pte = one_page_table_init(pmd); 272 pte = one_page_table_init(pmd);
207 273
208 for (pte_ofs = 0; 274 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 275 pte += pte_ofs;
276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
212 284
213 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
215 287
216 set_pte(pte, pfn_pte(pfn, prot)); 288 pages_4k++;
289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
217 } 293 }
218 max_pfn_mapped = pfn;
219 } 294 }
220 } 295 }
221} 296 if (mapping_iter == 1) {
297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
222 303
223static inline int page_kills_ppro(unsigned long pagenr) 304 /*
224{ 305 * local global flush tlb, which will flush the previous
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F) 306 * mappings present in both small and large page TLB's.
226 return 1; 307 */
227 return 0; 308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
228} 316}
229 317
230/* 318/*
@@ -287,29 +375,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 375 pkmap_page_table = pte;
288} 376}
289 377
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 378static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 379{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 380 ClearPageReserved(page);
293 ClearPageReserved(page); 381 init_page_count(page);
294 init_page_count(page); 382 __free_page(page);
295 __free_page(page); 383 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 384}
300 385
301#ifndef CONFIG_NUMA 386struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 387 unsigned long start_pfn;
388 unsigned long end_pfn;
389};
390
391static int __init add_highpages_work_fn(unsigned long start_pfn,
392 unsigned long end_pfn, void *datax)
303{ 393{
304 int pfn; 394 int node_pfn;
395 struct page *page;
396 unsigned long final_start_pfn, final_end_pfn;
397 struct add_highpages_data *data;
305 398
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 399 data = (struct add_highpages_data *)datax;
307 /* 400
308 * Holes under sparsemem might not have no mem_map[]: 401 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 402 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 403 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 404 return 0;
405
406 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
407 node_pfn++) {
408 if (!pfn_valid(node_pfn))
409 continue;
410 page = pfn_to_page(node_pfn);
411 add_one_highpage_init(page, node_pfn);
312 } 412 }
413
414 return 0;
415
416}
417
418void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
419 unsigned long end_pfn)
420{
421 struct add_highpages_data data;
422
423 data.start_pfn = start_pfn;
424 data.end_pfn = end_pfn;
425
426 work_with_active_regions(nid, add_highpages_work_fn, &data);
427}
428
429#ifndef CONFIG_NUMA
430static void __init set_highmem_pages_init(void)
431{
432 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
433
313 totalram_pages += totalhigh_pages; 434 totalram_pages += totalhigh_pages;
314} 435}
315#endif /* !CONFIG_NUMA */ 436#endif /* !CONFIG_NUMA */
@@ -317,14 +438,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 438#else
318# define kmap_init() do { } while (0) 439# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 440# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 441# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 442#endif /* CONFIG_HIGHMEM */
322 443
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 444void __init native_pagetable_setup_start(pgd_t *base)
329{ 445{
330 unsigned long pfn, va; 446 unsigned long pfn, va;
@@ -380,27 +496,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 496 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 497 * mappings.
382 */ 498 */
383static void __init pagetable_init(void) 499static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 500{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 501 unsigned long vaddr, end;
387 502
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 503 /*
405 * Fixed mappings, only the page table structure has to be 504 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 505 * created - mappings will be set by set_fixmap():
@@ -410,10 +509,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 509 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 510 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 511 early_ioremap_reset();
512}
413 513
414 permanent_kmaps_init(pgd_base); 514static void __init pagetable_init(void)
515{
516 pgd_t *pgd_base = swapper_pg_dir;
415 517
416 paravirt_pagetable_setup_done(pgd_base); 518 permanent_kmaps_init(pgd_base);
417} 519}
418 520
419#ifdef CONFIG_ACPI_SLEEP 521#ifdef CONFIG_ACPI_SLEEP
@@ -456,7 +558,7 @@ void zap_low_mappings(void)
456 558
457int nx_enabled; 559int nx_enabled;
458 560
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 563
462#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -509,27 +611,329 @@ static void __init set_nx(void)
509} 611}
510#endif 612#endif
511 613
614/* user-defined highmem size */
615static unsigned int highmem_pages = -1;
616
512/* 617/*
513 * paging_init() sets up the page tables - note that the first 8MB are 618 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 619 * This works even on boxes that have no highmem otherwise.
515 * 620 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 621 */
519void __init paging_init(void) 622static int __init parse_highmem(char *arg)
623{
624 if (!arg)
625 return -EINVAL;
626
627 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
628 return 0;
629}
630early_param("highmem", parse_highmem);
631
632/*
633 * Determine low and high memory ranges:
634 */
635void __init find_low_pfn_range(void)
520{ 636{
637 /* it could update max_pfn */
638
639 /* max_low_pfn is 0, we already have early_res support */
640
641 max_low_pfn = max_pfn;
642 if (max_low_pfn > MAXMEM_PFN) {
643 if (highmem_pages == -1)
644 highmem_pages = max_pfn - MAXMEM_PFN;
645 if (highmem_pages + MAXMEM_PFN < max_pfn)
646 max_pfn = MAXMEM_PFN + highmem_pages;
647 if (highmem_pages + MAXMEM_PFN > max_pfn) {
648 printk(KERN_WARNING "only %luMB highmem pages "
649 "available, ignoring highmem size of %uMB.\n",
650 pages_to_mb(max_pfn - MAXMEM_PFN),
651 pages_to_mb(highmem_pages));
652 highmem_pages = 0;
653 }
654 max_low_pfn = MAXMEM_PFN;
655#ifndef CONFIG_HIGHMEM
656 /* Maximum memory usable is what is directly addressable */
657 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
658 MAXMEM>>20);
659 if (max_pfn > MAX_NONPAE_PFN)
660 printk(KERN_WARNING
661 "Use a HIGHMEM64G enabled kernel.\n");
662 else
663 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
664 max_pfn = MAXMEM_PFN;
665#else /* !CONFIG_HIGHMEM */
666#ifndef CONFIG_HIGHMEM64G
667 if (max_pfn > MAX_NONPAE_PFN) {
668 max_pfn = MAX_NONPAE_PFN;
669 printk(KERN_WARNING "Warning only 4GB will be used."
670 "Use a HIGHMEM64G enabled kernel.\n");
671 }
672#endif /* !CONFIG_HIGHMEM64G */
673#endif /* !CONFIG_HIGHMEM */
674 } else {
675 if (highmem_pages == -1)
676 highmem_pages = 0;
677#ifdef CONFIG_HIGHMEM
678 if (highmem_pages >= max_pfn) {
679 printk(KERN_ERR "highmem size specified (%uMB) is "
680 "bigger than pages available (%luMB)!.\n",
681 pages_to_mb(highmem_pages),
682 pages_to_mb(max_pfn));
683 highmem_pages = 0;
684 }
685 if (highmem_pages) {
686 if (max_low_pfn - highmem_pages <
687 64*1024*1024/PAGE_SIZE){
688 printk(KERN_ERR "highmem size %uMB results in "
689 "smaller than 64MB lowmem, ignoring it.\n"
690 , pages_to_mb(highmem_pages));
691 highmem_pages = 0;
692 }
693 max_low_pfn -= highmem_pages;
694 }
695#else
696 if (highmem_pages)
697 printk(KERN_ERR "ignoring highmem size on non-highmem"
698 " kernel!\n");
699#endif
700 }
701}
702
703#ifndef CONFIG_NEED_MULTIPLE_NODES
704void __init initmem_init(unsigned long start_pfn,
705 unsigned long end_pfn)
706{
707#ifdef CONFIG_HIGHMEM
708 highstart_pfn = highend_pfn = max_pfn;
709 if (max_pfn > max_low_pfn)
710 highstart_pfn = max_low_pfn;
711 memory_present(0, 0, highend_pfn);
712 e820_register_active_regions(0, 0, highend_pfn);
713 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
714 pages_to_mb(highend_pfn - highstart_pfn));
715 num_physpages = highend_pfn;
716 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
717#else
718 memory_present(0, 0, max_low_pfn);
719 e820_register_active_regions(0, 0, max_low_pfn);
720 num_physpages = max_low_pfn;
721 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
722#endif
723#ifdef CONFIG_FLATMEM
724 max_mapnr = num_physpages;
725#endif
726 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
727 pages_to_mb(max_low_pfn));
728
729 setup_bootmem_allocator();
730}
731#endif /* !CONFIG_NEED_MULTIPLE_NODES */
732
733static void __init zone_sizes_init(void)
734{
735 unsigned long max_zone_pfns[MAX_NR_ZONES];
736 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
737 max_zone_pfns[ZONE_DMA] =
738 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
739 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
740#ifdef CONFIG_HIGHMEM
741 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
742#endif
743
744 free_area_init_nodes(max_zone_pfns);
745}
746
747void __init setup_bootmem_allocator(void)
748{
749 int i;
750 unsigned long bootmap_size, bootmap;
751 /*
752 * Initialize the boot-time allocator (with low memory only):
753 */
754 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
755 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
756 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
757 PAGE_SIZE);
758 if (bootmap == -1L)
759 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
760 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
761
762 /* don't touch min_low_pfn */
763 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
764 min_low_pfn, max_low_pfn);
765 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
766 max_pfn_mapped<<PAGE_SHIFT);
767 printk(KERN_INFO " low ram: %08lx - %08lx\n",
768 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
769 printk(KERN_INFO " bootmap %08lx - %08lx\n",
770 bootmap, bootmap + bootmap_size);
771 for_each_online_node(i)
772 free_bootmem_with_active_regions(i, max_low_pfn);
773 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
774
775 after_init_bootmem = 1;
776}
777
778static void __init find_early_table_space(unsigned long end, int use_pse)
779{
780 unsigned long puds, pmds, ptes, tables, start;
781
782 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
783 tables = PAGE_ALIGN(puds * sizeof(pud_t));
784
785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
787
788 if (use_pse) {
789 unsigned long extra;
790
791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
792 extra += PMD_SIZE;
793 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
794 } else
795 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
796
797 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
798
799 /* for fixmap */
800 tables += PAGE_SIZE * 2;
801
802 /*
803 * RED-PEN putting page tables only on node 0 could
804 * cause a hotspot and fill up ZONE_DMA. The page tables
805 * need roughly 0.5KB per GB.
806 */
807 start = 0x7000;
808 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
809 tables, PAGE_SIZE);
810 if (table_start == -1UL)
811 panic("Cannot find space for the kernel page tables");
812
813 table_start >>= PAGE_SHIFT;
814 table_end = table_start;
815 table_top = table_start + (tables>>PAGE_SHIFT);
816
817 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
818 end, table_start << PAGE_SHIFT,
819 (table_start << PAGE_SHIFT) + tables);
820}
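A rough sizing sketch of why use_pse matters for this estimate; it assumes 4 KiB pages, 2 MiB PMD-sized large pages and 8-byte (PAE-style) entries, none of which are stated in the patch itself:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21			/* assumed 2 MiB large pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << PMD_SHIFT)

static unsigned long ptes_needed(unsigned long end, int use_pse)
{
	unsigned long extra;

	if (use_pse) {
		/* only the tail beyond the last PMD boundary, plus one PMD of slack */
		extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
		extra += PMD_SIZE;
		return (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}
	return (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long end = 1UL << 30;	/* map the first 1 GiB */

	printf("4k mapping: %lu ptes (~%lu KiB of tables)\n",
	       ptes_needed(end, 0), ptes_needed(end, 0) * 8 / 1024);
	printf("2M mapping: %lu ptes (~%lu KiB of tables)\n",
	       ptes_needed(end, 1), ptes_needed(end, 1) * 8 / 1024);
	return 0;
}

Under these assumptions, mapping the first gigabyte with 4 KiB pages needs 262144 PTEs (about 2 MiB of page-table memory), while with 2 MiB pages only the unaligned head/tail needs PTEs (512 entries, about 4 KiB).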
821
822unsigned long __init_refok init_memory_mapping(unsigned long start,
823 unsigned long end)
824{
825 pgd_t *pgd_base = swapper_pg_dir;
826 unsigned long start_pfn, end_pfn;
827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This will simplify cpa(), which otherwise needs to support splitting
832 * large pages into small in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
838
839 /*
840 * Find space for the kernel direct mapping tables.
841 */
842 if (!after_init_bootmem)
843 find_early_table_space(end, use_pse);
844
521#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
522 set_nx(); 846 set_nx();
523 if (nx_enabled) 847 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 848 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 849#endif
526 pagetable_init(); 850
851 /* Enable PSE if available */
852 if (cpu_has_pse)
853 set_in_cr4(X86_CR4_PSE);
854
855 /* Enable PGE if available */
856 if (cpu_has_pge) {
857 set_in_cr4(X86_CR4_PGE);
858 __supported_pte_mask |= _PAGE_GLOBAL;
859 }
860
861 /*
862 * Don't use a large page for the first 2/4MB of memory
863 * because there are often fixed size MTRRs in there
864 * and overlapping MTRRs into large pages can cause
865 * slowdowns.
866 */
867 big_page_start = PMD_SIZE;
868
869 if (start < big_page_start) {
870 start_pfn = start >> PAGE_SHIFT;
871 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
872 } else {
 873 /* head is not big page aligned? */
874 start_pfn = start >> PAGE_SHIFT;
875 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
876 << (PMD_SHIFT - PAGE_SHIFT);
877 }
878 if (start_pfn < end_pfn)
879 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
880
881 /* big page range */
882 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
883 << (PMD_SHIFT - PAGE_SHIFT);
884 if (start_pfn < (big_page_start >> PAGE_SHIFT))
885 start_pfn = big_page_start >> PAGE_SHIFT;
886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
887 if (start_pfn < end_pfn)
888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
889 use_pse);
890
 891 /* tail is not big page aligned? */
892 start_pfn = end_pfn;
893 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
894 end_pfn = end >> PAGE_SHIFT;
895 if (start_pfn < end_pfn)
896 kernel_physical_mapping_init(pgd_base, start_pfn,
897 end_pfn, 0);
898 }
899
900 early_ioremap_page_table_range_init(pgd_base);
527 901
528 load_cr3(swapper_pg_dir); 902 load_cr3(swapper_pg_dir);
529 903
530 __flush_tlb_all(); 904 __flush_tlb_all();
531 905
906 if (!after_init_bootmem)
907 reserve_early(table_start << PAGE_SHIFT,
908 table_end << PAGE_SHIFT, "PGTABLE");
909
910 if (!after_init_bootmem)
911 early_memtest(start, end);
912
913 return end >> PAGE_SHIFT;
914}
915
916
917/*
918 * paging_init() sets up the page tables - note that the first 8MB are
919 * already mapped by head.S.
920 *
 921 * This routine also unmaps the page at virtual kernel address 0, so
922 * that we can trap those pesky NULL-reference errors in the kernel.
923 */
924void __init paging_init(void)
925{
926 pagetable_init();
927
928 __flush_tlb_all();
929
532 kmap_init(); 930 kmap_init();
931
932 /*
933 * NOTE: at this point the bootmem allocator is fully available.
934 */
935 sparse_init();
936 zone_sizes_init();
533} 937}
534 938
535/* 939/*
@@ -564,24 +968,13 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 968void __init mem_init(void)
565{ 969{
566 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 971 int tmp;
972
973 start_periodic_check_for_corruption();
568 974
569#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
571#endif 977#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 978 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 979 totalram_pages += free_all_bootmem();
587 980
@@ -593,7 +986,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 986 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 987 reservedpages++;
595 988
596 set_highmem_pages_init(bad_ppro); 989 set_highmem_pages_init();
597 990
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 991 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 992 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +1007,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 1007 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 1008 );
616 1009
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 1010 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 1011 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 1012#ifdef CONFIG_HIGHMEM
@@ -655,12 +1047,10 @@ void __init mem_init(void)
655#endif 1047#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 1048 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 1049 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 1050
660 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 1052 test_wp_bit();
662 1053
663 cpa_init();
664 save_pg_dir(); 1054 save_pg_dir();
665 zap_low_mappings(); 1055 zap_low_mappings();
666} 1056}
@@ -710,6 +1100,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1100 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1101 unsigned long size = PFN_ALIGN(_etext) - start;
712 1102
1103#ifndef CONFIG_DYNAMIC_FTRACE
1104 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1105 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1106 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1107 size >> 10);
@@ -722,6 +1114,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1114 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1115 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1116#endif
1117#endif /* CONFIG_DYNAMIC_FTRACE */
1118
725 start += size; 1119 start += size;
726 size = (unsigned long)__end_rodata - start; 1120 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1121 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1178,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1178 free_init_pages("initrd memory", start, end);
785} 1179}
786#endif 1180#endif
1181
1182int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1183 int flags)
1184{
1185 return reserve_bootmem(phys, len, flags);
1186}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 156e6d7b0e32..b8e461d49412 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -30,6 +31,7 @@
30#include <linux/nmi.h> 31#include <linux/nmi.h>
31 32
32#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
33#include <asm/system.h> 35#include <asm/system.h>
34#include <asm/uaccess.h> 36#include <asm/uaccess.h>
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -47,11 +49,19 @@
47#include <asm/numa.h> 49#include <asm/numa.h>
48#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
49 51
52/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
54 * The direct mapping extends to max_pfn_mapped, so that we can directly access
55 * apertures, ACPI and other tables without having to play with fixmaps.
56 */
57unsigned long max_low_pfn_mapped;
58unsigned long max_pfn_mapped;
59
50static unsigned long dma_reserve __initdata; 60static unsigned long dma_reserve __initdata;
51 61
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
53 63
54int direct_gbpages __meminitdata 64int direct_gbpages
55#ifdef CONFIG_DIRECT_GBPAGES 65#ifdef CONFIG_DIRECT_GBPAGES
56 = 1 66 = 1
57#endif 67#endif
@@ -77,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on);
77 * around without checking the pgd every time. 87 * around without checking the pgd every time.
78 */ 88 */
79 89
80void show_mem(void) 90int after_bootmem;
81{
82 long i, total = 0, reserved = 0;
83 long shared = 0, cached = 0;
84 struct page *page;
85 pg_data_t *pgdat;
86 91
87 printk(KERN_INFO "Mem-info:\n"); 92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
88 show_free_areas(); 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
89 for_each_online_pgdat(pgdat) {
90 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
91 /*
92 * This loop can take a while with 256 GB and
93 * 4k pages so defer the NMI watchdog:
94 */
95 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96 touch_nmi_watchdog();
97 94
98 if (!pfn_valid(pgdat->node_start_pfn + i)) 95static int do_not_nx __cpuinitdata;
99 continue;
100 96
101 page = pfn_to_page(pgdat->node_start_pfn + i); 97/*
102 total++; 98 * noexec=on|off
103 if (PageReserved(page)) 99 * Control non-executable mappings for 64-bit processes.
104 reserved++; 100 *
105 else if (PageSwapCache(page)) 101 * on Enable (default)
106 cached++; 102 * off Disable
107 else if (page_count(page)) 103 */
108 shared += page_count(page) - 1; 104static int __init nonx_setup(char *str)
109 } 105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
110 } 114 }
111 printk(KERN_INFO "%lu pages of RAM\n", total); 115 return 0;
112 printk(KERN_INFO "%lu reserved pages\n", reserved);
113 printk(KERN_INFO "%lu pages shared\n", shared);
114 printk(KERN_INFO "%lu pages swap cached\n", cached);
115} 116}
117early_param("noexec", nonx_setup);
116 118
117int after_bootmem; 119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
118 127
119static __init void *spp_getpage(void) 128int force_personality32;
129
130/*
131 * noexec32=on|off
 132 * Control non-executable heap for 32-bit processes.
 133 * To control the stack too, use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
148/*
149 * NOTE: This function is marked __ref because it calls __init function
150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
151 */
152static __ref void *spp_getpage(void)
120{ 153{
121 void *ptr; 154 void *ptr;
122 155
@@ -135,26 +168,17 @@ static __init void *spp_getpage(void)
135 return ptr; 168 return ptr;
136} 169}
137 170
138static void 171void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 173{
141 pgd_t *pgd;
142 pud_t *pud; 174 pud_t *pud;
143 pmd_t *pmd; 175 pmd_t *pmd;
144 pte_t *pte, new_pte; 176 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 177
148 pgd = pgd_offset_k(vaddr); 178 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 179 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 180 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 181 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 182 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 184 pmd, pmd_offset(pud, 0));
@@ -164,18 +188,14 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 188 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 189 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 190 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 191 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 192 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 193 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 194 return;
171 } 195 }
172 } 196 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 197
175 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) &&
177 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
178 pte_ERROR(*pte);
179 set_pte(pte, new_pte); 199 set_pte(pte, new_pte);
180 200
181 /* 201 /*
@@ -185,6 +205,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 205 __flush_tlb_one(vaddr);
186} 206}
187 207
208void
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{
211 pgd_t *pgd;
212 pud_t *pud_page;
213
214 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
215
216 pgd = pgd_offset_k(vaddr);
217 if (pgd_none(*pgd)) {
218 printk(KERN_ERR
219 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
220 return;
221 }
222 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
223 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224}
225
226/*
227 * Create large page table mappings for a range of physical addresses.
228 */
229static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
230 pgprot_t prot)
231{
232 pgd_t *pgd;
233 pud_t *pud;
234 pmd_t *pmd;
235
236 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
237 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
238 pgd = pgd_offset_k((unsigned long)__va(phys));
239 if (pgd_none(*pgd)) {
240 pud = (pud_t *) spp_getpage();
241 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
242 _PAGE_USER));
243 }
244 pud = pud_offset(pgd, (unsigned long)__va(phys));
245 if (pud_none(*pud)) {
246 pmd = (pmd_t *) spp_getpage();
247 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
248 _PAGE_USER));
249 }
250 pmd = pmd_offset(pud, phys);
251 BUG_ON(!pmd_none(*pmd));
252 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
253 }
254}
255
256void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
257{
258 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
259}
260
261void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
262{
263 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
264}
265
188/* 266/*
189 * The head.S code sets up the kernel high mapping: 267 * The head.S code sets up the kernel high mapping:
190 * 268 *
@@ -201,7 +279,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
201void __init cleanup_highmap(void) 279void __init cleanup_highmap(void)
202{ 280{
203 unsigned long vaddr = __START_KERNEL_map; 281 unsigned long vaddr = __START_KERNEL_map;
204 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 282 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
205 pmd_t *pmd = level2_kernel_pgt; 283 pmd_t *pmd = level2_kernel_pgt;
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 284 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 285
@@ -213,22 +291,11 @@ void __init cleanup_highmap(void)
213 } 291 }
214} 292}
215 293
216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 294static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 295static unsigned long __meminitdata table_end;
296static unsigned long __meminitdata table_top;
230 297
231static __meminit void *alloc_low_page(unsigned long *phys) 298static __ref void *alloc_low_page(unsigned long *phys)
232{ 299{
233 unsigned long pfn = table_end++; 300 unsigned long pfn = table_end++;
234 void *adr; 301 void *adr;
@@ -240,16 +307,16 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 307 return adr;
241 } 308 }
242 309
243 if (pfn >= end_pfn) 310 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 311 panic("alloc_low_page: ran out of memory");
245 312
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
247 memset(adr, 0, PAGE_SIZE); 314 memset(adr, 0, PAGE_SIZE);
248 *phys = pfn * PAGE_SIZE; 315 *phys = pfn * PAGE_SIZE;
249 return adr; 316 return adr;
250} 317}
251 318
252static __meminit void unmap_low_page(void *adr) 319static __ref void unmap_low_page(void *adr)
253{ 320{
254 if (after_bootmem) 321 if (after_bootmem)
255 return; 322 return;
@@ -257,65 +324,71 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 324 early_iounmap(adr, PAGE_SIZE);
258} 325}
259 326
260/* Must run before zap_low_mappings */ 327static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 328phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
329 pgprot_t prot)
262{ 330{
263 pmd_t *pmd, *last_pmd; 331 unsigned pages = 0;
264 unsigned long vaddr; 332 unsigned long last_map_addr = end;
265 int i, pmds; 333 int i;
334
335 pte_t *pte = pte_page + pte_index(addr);
266 336
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 337 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 338
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 339 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 340 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 341 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 342 set_pte(pte, __pte(0));
343 }
344 break;
276 } 345 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 346
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 347 /*
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 348 * We will re-use the existing mapping.
282 __flush_tlb_all(); 349 * Xen for example has some special requirements, like mapping
 350 * pagetable pages as RO. So assume that whoever pre-set up
 351 * these mappings knew what they were doing.
352 */
353 if (pte_val(*pte))
354 continue;
283 355
284 return (void *)vaddr; 356 if (0)
285continue_outer_loop: 357 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 358 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
359 pages++;
360 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
361 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
287 } 362 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
289 363
290 return NULL; 364 update_page_count(PG_LEVEL_4K, pages);
365
366 return last_map_addr;
291} 367}
292 368
293/* 369static unsigned long __meminit
294 * To avoid virtual aliases later: 370phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
295 */ 371 pgprot_t prot)
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 372{
298 unsigned long vaddr; 373 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301 374
302 vaddr = (unsigned long)addr; 375 return phys_pte_init(pte, address, end, prot);
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305
306 for (i = 0; i < pmds; i++)
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 376}
311 377
312static unsigned long __meminit 378static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 379phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
380 unsigned long page_size_mask, pgprot_t prot)
314{ 381{
382 unsigned long pages = 0;
383 unsigned long last_map_addr = end;
384
315 int i = pmd_index(address); 385 int i = pmd_index(address);
316 386
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 387 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
388 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 389 pmd_t *pmd = pmd_page + pmd_index(address);
390 pte_t *pte;
391 pgprot_t new_prot = prot;
319 392
320 if (address >= end) { 393 if (address >= end) {
321 if (!after_bootmem) { 394 if (!after_bootmem) {
@@ -325,31 +398,71 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 398 break;
326 } 399 }
327 400
328 if (pmd_val(*pmd)) 401 if (pmd_val(*pmd)) {
402 if (!pmd_large(*pmd)) {
403 spin_lock(&init_mm.page_table_lock);
404 last_map_addr = phys_pte_update(pmd, address,
405 end, prot);
406 spin_unlock(&init_mm.page_table_lock);
407 continue;
408 }
409 /*
410 * If we are ok with PG_LEVEL_2M mapping, then we will
411 * use the existing mapping,
412 *
413 * Otherwise, we will split the large page mapping but
414 * use the same existing protection bits except for
415 * large page, so that we don't violate Intel's TLB
416 * Application note (317080) which says, while changing
417 * the page sizes, new and old translations should
418 * not differ with respect to page frame and
419 * attributes.
420 */
421 if (page_size_mask & (1 << PG_LEVEL_2M))
422 continue;
423 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
424 }
425
426 if (page_size_mask & (1<<PG_LEVEL_2M)) {
427 pages++;
428 spin_lock(&init_mm.page_table_lock);
429 set_pte((pte_t *)pmd,
430 pfn_pte(address >> PAGE_SHIFT,
431 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
432 spin_unlock(&init_mm.page_table_lock);
433 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
329 continue; 434 continue;
435 }
330 436
331 set_pte((pte_t *)pmd, 437 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 438 last_map_addr = phys_pte_init(pte, address, end, new_prot);
439 unmap_low_page(pte);
440
441 spin_lock(&init_mm.page_table_lock);
442 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
443 spin_unlock(&init_mm.page_table_lock);
333 } 444 }
334 return address; 445 update_page_count(PG_LEVEL_2M, pages);
446 return last_map_addr;
335} 447}
336 448
337static unsigned long __meminit 449static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 450phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
451 unsigned long page_size_mask, pgprot_t prot)
339{ 452{
340 pmd_t *pmd = pmd_offset(pud, 0); 453 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 454 unsigned long last_map_addr;
342 455
343 spin_lock(&init_mm.page_table_lock); 456 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
344 last_map_addr = phys_pmd_init(pmd, address, end);
345 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 457 __flush_tlb_all();
347 return last_map_addr; 458 return last_map_addr;
348} 459}
349 460
350static unsigned long __meminit 461static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 462phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
463 unsigned long page_size_mask)
352{ 464{
465 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 466 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 467 int i = pud_index(addr);
355 468
@@ -357,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
357 unsigned long pmd_phys; 470 unsigned long pmd_phys;
358 pud_t *pud = pud_page + pud_index(addr); 471 pud_t *pud = pud_page + pud_index(addr);
359 pmd_t *pmd; 472 pmd_t *pmd;
473 pgprot_t prot = PAGE_KERNEL;
360 474
361 if (addr >= end) 475 if (addr >= end)
362 break; 476 break;
@@ -368,42 +482,87 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
368 } 482 }
369 483
370 if (pud_val(*pud)) { 484 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 485 if (!pud_large(*pud)) {
372 last_map_addr = phys_pmd_update(pud, addr, end); 486 last_map_addr = phys_pmd_update(pud, addr, end,
373 continue; 487 page_size_mask, prot);
488 continue;
489 }
490 /*
491 * If we are ok with PG_LEVEL_1G mapping, then we will
492 * use the existing mapping.
493 *
494 * Otherwise, we will split the gbpage mapping but use
495 * the same existing protection bits except for large
496 * page, so that we don't violate Intel's TLB
497 * Application note (317080) which says, while changing
498 * the page sizes, new and old translations should
499 * not differ with respect to page frame and
500 * attributes.
501 */
502 if (page_size_mask & (1 << PG_LEVEL_1G))
503 continue;
504 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
374 } 505 }
375 506
376 if (direct_gbpages) { 507 if (page_size_mask & (1<<PG_LEVEL_1G)) {
508 pages++;
509 spin_lock(&init_mm.page_table_lock);
377 set_pte((pte_t *)pud, 510 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 511 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
512 spin_unlock(&init_mm.page_table_lock);
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 513 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
380 continue; 514 continue;
381 } 515 }
382 516
383 pmd = alloc_low_page(&pmd_phys); 517 pmd = alloc_low_page(&pmd_phys);
518 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
519 prot);
520 unmap_low_page(pmd);
384 521
385 spin_lock(&init_mm.page_table_lock); 522 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 523 pud_populate(&init_mm, pud, __va(pmd_phys));
387 last_map_addr = phys_pmd_init(pmd, addr, end);
388 spin_unlock(&init_mm.page_table_lock); 524 spin_unlock(&init_mm.page_table_lock);
389
390 unmap_low_page(pmd);
391 } 525 }
392 __flush_tlb_all(); 526 __flush_tlb_all();
393 527
394 return last_map_addr >> PAGE_SHIFT; 528 update_page_count(PG_LEVEL_1G, pages);
529
530 return last_map_addr;
395} 531}
396 532
397static void __init find_early_table_space(unsigned long end) 533static unsigned long __meminit
534phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
535 unsigned long page_size_mask)
398{ 536{
399 unsigned long puds, pmds, tables, start; 537 pud_t *pud;
538
539 pud = (pud_t *)pgd_page_vaddr(*pgd);
540
541 return phys_pud_init(pud, addr, end, page_size_mask);
542}
543
544static void __init find_early_table_space(unsigned long end, int use_pse,
545 int use_gbpages)
546{
547 unsigned long puds, pmds, ptes, tables, start;
400 548
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 549 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 550 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 551 if (use_gbpages) {
552 unsigned long extra;
553 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
554 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
555 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 556 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 557 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 558
559 if (use_pse) {
560 unsigned long extra;
561 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
562 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
563 } else
564 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
565 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
407 566
408 /* 567 /*
409 * RED-PEN putting page tables only on node 0 could 568 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +576,10 @@ static void __init find_early_table_space(unsigned long end)
417 576
418 table_start >>= PAGE_SHIFT; 577 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 578 table_end = table_start;
579 table_top = table_start + (tables >> PAGE_SHIFT);
420 580
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 581 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 582 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 583}
425 584
426static void __init init_gbpages(void) 585static void __init init_gbpages(void)
@@ -431,125 +590,85 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 590 direct_gbpages = 0;
432} 591}
433 592
434#ifdef CONFIG_MEMTEST_BOOTPARAM 593static unsigned long __init kernel_physical_mapping_init(unsigned long start,
435 594 unsigned long end,
436static void __init memtest(unsigned long start_phys, unsigned long size, 595 unsigned long page_size_mask)
437 unsigned pattern) 596{
438{
439 unsigned long i;
440 unsigned long *start;
441 unsigned long start_bad;
442 unsigned long last_bad;
443 unsigned long val;
444 unsigned long start_phys_aligned;
445 unsigned long count;
446 unsigned long incr;
447
448 switch (pattern) {
449 case 0:
450 val = 0UL;
451 break;
452 case 1:
453 val = -1UL;
454 break;
455 case 2:
456 val = 0x5555555555555555UL;
457 break;
458 case 3:
459 val = 0xaaaaaaaaaaaaaaaaUL;
460 break;
461 default:
462 return;
463 }
464 597
465 incr = sizeof(unsigned long); 598 unsigned long next, last_map_addr = end;
466 start_phys_aligned = ALIGN(start_phys, incr);
467 count = (size - (start_phys_aligned - start_phys))/incr;
468 start = __va(start_phys_aligned);
469 start_bad = 0;
470 last_bad = 0;
471
472 for (i = 0; i < count; i++)
473 start[i] = val;
474 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
475 if (*start != val) {
476 if (start_phys_aligned == last_bad + incr) {
477 last_bad += incr;
478 } else {
479 if (start_bad) {
480 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
481 val, start_bad, last_bad + incr);
482 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
483 }
484 start_bad = last_bad = start_phys_aligned;
485 }
486 }
487 }
488 if (start_bad) {
489 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
490 val, start_bad, last_bad + incr);
491 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
492 }
493 599
494} 600 start = (unsigned long)__va(start);
601 end = (unsigned long)__va(end);
495 602
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 603 for (; start < end; start = next) {
604 pgd_t *pgd = pgd_offset_k(start);
605 unsigned long pud_phys;
606 pud_t *pud;
497 607
498static int __init parse_memtest(char *arg) 608 next = (start + PGDIR_SIZE) & PGDIR_MASK;
499{ 609 if (next > end)
500 if (arg) 610 next = end;
501 memtest_pattern = simple_strtoul(arg, NULL, 0);
502 return 0;
503}
504 611
505early_param("memtest", parse_memtest); 612 if (pgd_val(*pgd)) {
613 last_map_addr = phys_pud_update(pgd, __pa(start),
614 __pa(end), page_size_mask);
615 continue;
616 }
506 617
507static void __init early_memtest(unsigned long start, unsigned long end) 618 pud = alloc_low_page(&pud_phys);
508{ 619 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
509 u64 t_start, t_size; 620 page_size_mask);
510 unsigned pattern; 621 unmap_low_page(pud);
511 622
512 if (!memtest_pattern) 623 spin_lock(&init_mm.page_table_lock);
513 return; 624 pgd_populate(&init_mm, pgd, __va(pud_phys));
625 spin_unlock(&init_mm.page_table_lock);
626 }
627 __flush_tlb_all();
514 628
515 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 629 return last_map_addr;
516 for (pattern = 0; pattern < memtest_pattern; pattern++) { 630}
517 t_start = start;
518 t_size = 0;
519 while (t_start < end) {
520 t_start = find_e820_area_size(t_start, &t_size, 1);
521 631
522 /* done ? */ 632struct map_range {
523 if (t_start >= end) 633 unsigned long start;
524 break; 634 unsigned long end;
525 if (t_start + t_size > end) 635 unsigned page_size_mask;
526 t_size = end - t_start; 636};
527 637
528 printk(KERN_CONT "\n %016llx - %016llx pattern %d", 638#define NR_RANGE_MR 5
529 t_start, t_start + t_size, pattern);
530 639
531 memtest(t_start, t_size, pattern); 640static int save_mr(struct map_range *mr, int nr_range,
641 unsigned long start_pfn, unsigned long end_pfn,
642 unsigned long page_size_mask)
643{
532 644
533 t_start += t_size; 645 if (start_pfn < end_pfn) {
534 } 646 if (nr_range >= NR_RANGE_MR)
 647 panic("ran out of ranges for init_memory_mapping\n");
648 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
649 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
650 mr[nr_range].page_size_mask = page_size_mask;
651 nr_range++;
535 } 652 }
536 printk(KERN_CONT "\n"); 653
537} 654 return nr_range;
538#else
539static void __init early_memtest(unsigned long start, unsigned long end)
540{
541} 655}
542#endif
543 656
544/* 657/*
545 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 658 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
546 * This runs before bootmem is initialized and gets pages directly from 659 * This runs before bootmem is initialized and gets pages directly from
547 * the physical memory. To access them they are temporarily mapped. 660 * the physical memory. To access them they are temporarily mapped.
548 */ 661 */
549unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 662unsigned long __init_refok init_memory_mapping(unsigned long start,
663 unsigned long end)
550{ 664{
551 unsigned long next, last_map_addr = end; 665 unsigned long last_map_addr = 0;
552 unsigned long start_phys = start, end_phys = end; 666 unsigned long page_size_mask = 0;
667 unsigned long start_pfn, end_pfn;
668
669 struct map_range mr[NR_RANGE_MR];
670 int nr_range, i;
671 int use_pse, use_gbpages;
553 672
554 printk(KERN_INFO "init_memory_mapping\n"); 673 printk(KERN_INFO "init_memory_mapping\n");
555 674
@@ -560,48 +679,127 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
560 * memory mapped. Unfortunately this is done currently before the 679 * memory mapped. Unfortunately this is done currently before the
561 * nodes are discovered. 680 * nodes are discovered.
562 */ 681 */
563 if (!after_bootmem) { 682 if (!after_bootmem)
564 init_gbpages(); 683 init_gbpages();
565 find_early_table_space(end);
566 }
567 684
568 start = (unsigned long)__va(start); 685#ifdef CONFIG_DEBUG_PAGEALLOC
569 end = (unsigned long)__va(end); 686 /*
687 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
688 * This will simplify cpa(), which otherwise needs to support splitting
689 * large pages into small in interrupt context, etc.
690 */
691 use_pse = use_gbpages = 0;
692#else
693 use_pse = cpu_has_pse;
694 use_gbpages = direct_gbpages;
695#endif
570 696
571 for (; start < end; start = next) { 697 if (use_gbpages)
572 pgd_t *pgd = pgd_offset_k(start); 698 page_size_mask |= 1 << PG_LEVEL_1G;
573 unsigned long pud_phys; 699 if (use_pse)
574 pud_t *pud; 700 page_size_mask |= 1 << PG_LEVEL_2M;
701
702 memset(mr, 0, sizeof(mr));
703 nr_range = 0;
704
 705 /* head is not big page aligned? */
706 start_pfn = start >> PAGE_SHIFT;
707 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
708 << (PMD_SHIFT - PAGE_SHIFT);
709 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
710
 711 /* big page (2M) range */
712 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
713 << (PMD_SHIFT - PAGE_SHIFT);
714 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
715 << (PUD_SHIFT - PAGE_SHIFT);
716 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
717 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
718 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
719 page_size_mask & (1<<PG_LEVEL_2M));
720
721 /* big page (1G) range */
722 start_pfn = end_pfn;
723 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
724 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
725 page_size_mask &
726 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
727
 728 /* tail is not big page (1G) aligned */
729 start_pfn = end_pfn;
730 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
731 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
732 page_size_mask & (1<<PG_LEVEL_2M));
733
 734 /* tail is not big page (2M) aligned */
735 start_pfn = end_pfn;
736 end_pfn = end>>PAGE_SHIFT;
737 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
738
 739 /* try to merge contiguous ranges with the same page size */
740 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
741 unsigned long old_start;
742 if (mr[i].end != mr[i+1].start ||
743 mr[i].page_size_mask != mr[i+1].page_size_mask)
744 continue;
745 /* move it */
746 old_start = mr[i].start;
747 memmove(&mr[i], &mr[i+1],
748 (nr_range - 1 - i) * sizeof (struct map_range));
749 mr[i--].start = old_start;
750 nr_range--;
751 }
575 752
576 if (after_bootmem) 753 for (i = 0; i < nr_range; i++)
577 pud = pud_offset(pgd, start & PGDIR_MASK); 754 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
578 else 755 mr[i].start, mr[i].end,
579 pud = alloc_low_page(&pud_phys); 756 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
757 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
580 758
581 next = start + PGDIR_SIZE; 759 if (!after_bootmem)
582 if (next > end) 760 find_early_table_space(end, use_pse, use_gbpages);
583 next = end; 761
584 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 762 for (i = 0; i < nr_range; i++)
585 if (!after_bootmem) 763 last_map_addr = kernel_physical_mapping_init(
586 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 764 mr[i].start, mr[i].end,
587 unmap_low_page(pud); 765 mr[i].page_size_mask);
588 }
589 766
590 if (!after_bootmem) 767 if (!after_bootmem)
591 mmu_cr4_features = read_cr4(); 768 mmu_cr4_features = read_cr4();
592 __flush_tlb_all(); 769 __flush_tlb_all();
593 770
594 if (!after_bootmem) 771 if (!after_bootmem && table_end > table_start)
595 reserve_early(table_start << PAGE_SHIFT, 772 reserve_early(table_start << PAGE_SHIFT,
596 table_end << PAGE_SHIFT, "PGTABLE"); 773 table_end << PAGE_SHIFT, "PGTABLE");
597 774
775 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
776 last_map_addr, end);
777
598 if (!after_bootmem) 778 if (!after_bootmem)
599 early_memtest(start_phys, end_phys); 779 early_memtest(start, end);
600 780
601 return last_map_addr; 781 return last_map_addr >> PAGE_SHIFT;
602} 782}
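As an illustration of the new range splitting (the addresses here are chosen for the example, not taken from the patch): mapping 0 .. 0x41200000 (1 GiB + 18 MiB) on a machine with both PSE and gbpages available collapses to two entries after the merge pass, and the debug printout above would show roughly:

  0000000000 - 0040000000 page 1G
  0040000000 - 0041200000 page 2M

An end address that is not 2 MiB aligned would additionally produce a trailing "page 4k" range; head and tail sub-ranges that end up empty are simply skipped by save_mr().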
603 783
604#ifndef CONFIG_NUMA 784#ifndef CONFIG_NUMA
785void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
786{
787 unsigned long bootmap_size, bootmap;
788
789 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
790 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
791 PAGE_SIZE);
792 if (bootmap == -1L)
793 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
794 /* don't touch min_low_pfn */
795 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
796 0, end_pfn);
797 e820_register_active_regions(0, start_pfn, end_pfn);
798 free_bootmem_with_active_regions(0, end_pfn);
799 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
800 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
801}
802
605void __init paging_init(void) 803void __init paging_init(void)
606{ 804{
607 unsigned long max_zone_pfns[MAX_NR_ZONES]; 805 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -609,9 +807,9 @@ void __init paging_init(void)
609 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 807 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
610 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 808 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
611 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 809 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
612 max_zone_pfns[ZONE_NORMAL] = end_pfn; 810 max_zone_pfns[ZONE_NORMAL] = max_pfn;
613 811
614 memory_present(0, 0, end_pfn); 812 memory_present(0, 0, max_pfn);
615 sparse_init(); 813 sparse_init();
616 free_area_init_nodes(max_zone_pfns); 814 free_area_init_nodes(max_zone_pfns);
617} 815}
@@ -681,6 +879,8 @@ void __init mem_init(void)
681{ 879{
682 long codesize, reservedpages, datasize, initsize; 880 long codesize, reservedpages, datasize, initsize;
683 881
882 start_periodic_check_for_corruption();
883
684 pci_iommu_alloc(); 884 pci_iommu_alloc();
685 885
686 /* clear_bss() already clear the empty_zero_page */ 886 /* clear_bss() already clear the empty_zero_page */
@@ -693,8 +893,8 @@ void __init mem_init(void)
693#else 893#else
694 totalram_pages = free_all_bootmem(); 894 totalram_pages = free_all_bootmem();
695#endif 895#endif
696 reservedpages = end_pfn - totalram_pages - 896 reservedpages = max_pfn - totalram_pages -
697 absent_pages_in_range(0, end_pfn); 897 absent_pages_in_range(0, max_pfn);
698 after_bootmem = 1; 898 after_bootmem = 1;
699 899
700 codesize = (unsigned long) &_etext - (unsigned long) &_text; 900 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -713,13 +913,11 @@ void __init mem_init(void)
713 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 913 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
714 "%ldk reserved, %ldk data, %ldk init)\n", 914 "%ldk reserved, %ldk data, %ldk init)\n",
715 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 915 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
716 end_pfn << (PAGE_SHIFT-10), 916 max_pfn << (PAGE_SHIFT-10),
717 codesize >> 10, 917 codesize >> 10,
718 reservedpages << (PAGE_SHIFT-10), 918 reservedpages << (PAGE_SHIFT-10),
719 datasize >> 10, 919 datasize >> 10,
720 initsize >> 10); 920 initsize >> 10);
721
722 cpa_init();
723} 921}
724 922
725void free_init_pages(char *what, unsigned long begin, unsigned long end) 923void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -766,6 +964,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 964void mark_rodata_ro(void)
767{ 965{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 966 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
967 unsigned long rodata_start =
968 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
969
970#ifdef CONFIG_DYNAMIC_FTRACE
971 /* Dynamic tracing modifies the kernel text section */
972 start = rodata_start;
973#endif
769 974
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 975 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 976 (end - start) >> 10);
@@ -775,8 +980,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 980 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 981 * not-executable.
777 */ 982 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 983 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 984
781 rodata_test(); 985 rodata_test();
782 986
@@ -798,24 +1002,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
798} 1002}
799#endif 1003#endif
800 1004
801void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1005int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1006 int flags)
802{ 1007{
803#ifdef CONFIG_NUMA 1008#ifdef CONFIG_NUMA
804 int nid, next_nid; 1009 int nid, next_nid;
1010 int ret;
805#endif 1011#endif
806 unsigned long pfn = phys >> PAGE_SHIFT; 1012 unsigned long pfn = phys >> PAGE_SHIFT;
807 1013
808 if (pfn >= end_pfn) { 1014 if (pfn >= max_pfn) {
809 /* 1015 /*
810 * This can happen with kdump kernels when accessing 1016 * This can happen with kdump kernels when accessing
811 * firmware tables: 1017 * firmware tables:
812 */ 1018 */
813 if (pfn < max_pfn_mapped) 1019 if (pfn < max_pfn_mapped)
814 return; 1020 return -EFAULT;
815 1021
816 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1022 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
817 phys, len); 1023 phys, len);
818 return; 1024 return -EFAULT;
819 } 1025 }
820 1026
821 /* Should check here against the e820 map to avoid double free */ 1027 /* Should check here against the e820 map to avoid double free */
@@ -823,9 +1029,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
823 nid = phys_to_nid(phys); 1029 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1); 1030 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid) 1031 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1032 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
827 else 1033 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1034 ret = reserve_bootmem(phys, len, flags);
1035
1036 if (ret != 0)
1037 return ret;
1038
829#else 1039#else
830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1040 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
831#endif 1041#endif
@@ -834,6 +1044,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
834 dma_reserve += len / PAGE_SIZE; 1044 dma_reserve += len / PAGE_SIZE;
835 set_dma_reserve(dma_reserve); 1045 set_dma_reserve(dma_reserve);
836 } 1046 }
1047
1048 return 0;
837} 1049}
838 1050
839int kern_addr_valid(unsigned long addr) 1051int kern_addr_valid(unsigned long addr)
@@ -938,7 +1150,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
938 pmd_t *pmd; 1150 pmd_t *pmd;
939 1151
940 for (; addr < end; addr = next) { 1152 for (; addr < end; addr = next) {
941 next = pmd_addr_end(addr, end); 1153 void *p = NULL;
942 1154
943 pgd = vmemmap_pgd_populate(addr, node); 1155 pgd = vmemmap_pgd_populate(addr, node);
944 if (!pgd) 1156 if (!pgd)
@@ -948,33 +1160,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
948 if (!pud) 1160 if (!pud)
949 return -ENOMEM; 1161 return -ENOMEM;
950 1162
951 pmd = pmd_offset(pud, addr); 1163 if (!cpu_has_pse) {
952 if (pmd_none(*pmd)) { 1164 next = (addr + PAGE_SIZE) & PAGE_MASK;
953 pte_t entry; 1165 pmd = vmemmap_pmd_populate(pud, addr, node);
954 void *p; 1166
1167 if (!pmd)
1168 return -ENOMEM;
1169
1170 p = vmemmap_pte_populate(pmd, addr, node);
955 1171
956 p = vmemmap_alloc_block(PMD_SIZE, node);
957 if (!p) 1172 if (!p)
958 return -ENOMEM; 1173 return -ENOMEM;
959 1174
960 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1175 addr_end = addr + PAGE_SIZE;
961 PAGE_KERNEL_LARGE); 1176 p_end = p + PAGE_SIZE;
962 set_pmd(pmd, __pmd(pte_val(entry)));
963
964 /* check to see if we have contiguous blocks */
965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
975 } else { 1177 } else {
976 vmemmap_verify((pte_t *)pmd, node, addr, next); 1178 next = pmd_addr_end(addr, end);
1179
1180 pmd = pmd_offset(pud, addr);
1181 if (pmd_none(*pmd)) {
1182 pte_t entry;
1183
1184 p = vmemmap_alloc_block(PMD_SIZE, node);
1185 if (!p)
1186 return -ENOMEM;
1187
1188 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1189 PAGE_KERNEL_LARGE);
1190 set_pmd(pmd, __pmd(pte_val(entry)));
1191
1192 /* check to see if we have contiguous blocks */
1193 if (p_end != p || node_start != node) {
1194 if (p_start)
1195 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1196 addr_start, addr_end-1, p_start, p_end-1, node_start);
1197 addr_start = addr;
1198 node_start = node;
1199 p_start = p;
1200 }
1201
1202 addr_end = addr + PMD_SIZE;
1203 p_end = p + PMD_SIZE;
1204 } else
1205 vmemmap_verify((pte_t *)pmd, node, addr, next);
977 } 1206 }
1207
978 } 1208 }
979 return 0; 1209 return 0;
980} 1210}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2b2bb3f9b683..e4c43ec71b29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -23,18 +24,47 @@
23 24
24#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
25 26
27static inline int phys_addr_valid(unsigned long addr)
28{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30}
31
26unsigned long __phys_addr(unsigned long x) 32unsigned long __phys_addr(unsigned long x)
27{ 33{
28 if (x >= __START_KERNEL_map) 34 if (x >= __START_KERNEL_map) {
29 return x - __START_KERNEL_map + phys_base; 35 x -= __START_KERNEL_map;
30 return x - PAGE_OFFSET; 36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
31} 45}
32EXPORT_SYMBOL(__phys_addr); 46EXPORT_SYMBOL(__phys_addr);
33 47
34static inline int phys_addr_valid(unsigned long addr) 48bool __virt_addr_valid(unsigned long x)
35{ 49{
36 return addr < (1UL << boot_cpu_data.x86_phys_bits); 50 if (x >= __START_KERNEL_map) {
51 x -= __START_KERNEL_map;
52 if (x >= KERNEL_IMAGE_SIZE)
53 return false;
54 x += phys_base;
55 } else {
56 if (x < PAGE_OFFSET)
57 return false;
58 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ?
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false;
62 }
63 }
64
65 return pfn_valid(x >> PAGE_SHIFT);
37} 66}
67EXPORT_SYMBOL(__virt_addr_valid);
38 68
39#else 69#else
40 70
@@ -43,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
43 return 1; 73 return 1;
44} 74}
45 75
76#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x)
78{
 79 /* VMALLOC_* aren't constants; not available at boot time */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET;
84}
85EXPORT_SYMBOL(__phys_addr);
86#endif
87
88bool __virt_addr_valid(unsigned long x)
89{
90 if (x < PAGE_OFFSET)
91 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
93 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95}
96EXPORT_SYMBOL(__virt_addr_valid);
97
46#endif 98#endif
47 99
48int page_is_ram(unsigned long pagenr) 100int page_is_ram(unsigned long pagenr)
@@ -82,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
82 return 0; 134 return 0;
83} 135}
84 136
137int pagerange_is_ram(unsigned long start, unsigned long end)
138{
139 int ram_page = 0, not_rampage = 0;
140 unsigned long page_nr;
141
142 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
143 ++page_nr) {
144 if (page_is_ram(page_nr))
145 ram_page = 1;
146 else
147 not_rampage = 1;
148
149 if (ram_page == not_rampage)
150 return -1;
151 }
152
153 return ram_page;
154}
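The tri-state return value here is easy to misread: 1 means the whole range is RAM, 0 means none of it is, and -1 flags a mixed range. A hypothetical caller (not from this patch) would use it as:

	int is_ram = pagerange_is_ram(start, end);

	if (is_ram < 0)
		return -EINVAL;		/* range straddles RAM and non-RAM; refuse it */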
155
85/* 156/*
86 * Fix up the linear direct mapping of the kernel to avoid cache attribute 157 * Fix up the linear direct mapping of the kernel to avoid cache attribute
87 * conflicts. 158 * conflicts.
@@ -122,10 +193,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 193{
123 unsigned long pfn, offset, vaddr; 194 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 195 resource_size_t last_addr;
196 const resource_size_t unaligned_phys_addr = phys_addr;
197 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 198 struct vm_struct *area;
126 unsigned long new_prot_val; 199 unsigned long new_prot_val;
127 pgprot_t prot; 200 pgprot_t prot;
128 int retval; 201 int retval;
202 void __iomem *ret_addr;
129 203
130 /* Don't allow wraparound or zero size */ 204 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 205 last_addr = phys_addr + size - 1;
@@ -142,7 +216,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 216 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 217 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 218 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 219 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 220 return (__force void __iomem *)phys_to_virt(phys_addr);
147 221
148 /* 222 /*
@@ -166,7 +240,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
166 phys_addr &= PAGE_MASK; 240 phys_addr &= PAGE_MASK;
167 size = PAGE_ALIGN(last_addr+1) - phys_addr; 241 size = PAGE_ALIGN(last_addr+1) - phys_addr;
168 242
169 retval = reserve_memtype(phys_addr, phys_addr + size, 243 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
170 prot_val, &new_prot_val); 244 prot_val, &new_prot_val);
171 if (retval) { 245 if (retval) {
172 pr_debug("Warning: reserve_memtype returned %d\n", retval); 246 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -200,16 +274,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
200 switch (prot_val) { 274 switch (prot_val) {
201 case _PAGE_CACHE_UC: 275 case _PAGE_CACHE_UC:
202 default: 276 default:
203 prot = PAGE_KERNEL_NOCACHE; 277 prot = PAGE_KERNEL_IO_NOCACHE;
204 break; 278 break;
205 case _PAGE_CACHE_UC_MINUS: 279 case _PAGE_CACHE_UC_MINUS:
206 prot = PAGE_KERNEL_UC_MINUS; 280 prot = PAGE_KERNEL_IO_UC_MINUS;
207 break; 281 break;
208 case _PAGE_CACHE_WC: 282 case _PAGE_CACHE_WC:
209 prot = PAGE_KERNEL_WC; 283 prot = PAGE_KERNEL_IO_WC;
210 break; 284 break;
211 case _PAGE_CACHE_WB: 285 case _PAGE_CACHE_WB:
212 prot = PAGE_KERNEL; 286 prot = PAGE_KERNEL_IO;
213 break; 287 break;
214 } 288 }
215 289
@@ -233,7 +307,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 307 return NULL;
234 } 308 }
235 309
236 return (void __iomem *) (vaddr + offset); 310 ret_addr = (void __iomem *) (vaddr + offset);
311 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
312
313 return ret_addr;
237} 314}
238 315
239/** 316/**
@@ -261,7 +338,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 338{
262 /* 339 /*
263 * Ideally, this should be: 340 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 341 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 342 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 343 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 344 * UC MINUS.
@@ -285,7 +362,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 362 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 363void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 364{
288 if (pat_wc_enabled) 365 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 366 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 367 __builtin_return_address(0));
291 else 368 else
@@ -300,6 +377,37 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
300} 377}
301EXPORT_SYMBOL(ioremap_cache); 378EXPORT_SYMBOL(ioremap_cache);
302 379
380static void __iomem *ioremap_default(resource_size_t phys_addr,
381 unsigned long size)
382{
383 unsigned long flags;
384 void *ret;
385 int err;
386
387 /*
388 * - WB for WB-able memory and no other conflicting mappings
389 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 390 * - Inherit from conflicting mappings otherwise
391 */
392 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
393 if (err < 0)
394 return NULL;
395
396 ret = (void *) __ioremap_caller(phys_addr, size, flags,
397 __builtin_return_address(0));
398
399 free_memtype(phys_addr, phys_addr + size);
400 return (void __iomem *)ret;
401}
402
403void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
404 unsigned long prot_val)
405{
406 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
407 __builtin_return_address(0));
408}
409EXPORT_SYMBOL(ioremap_prot);
410
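ioremap_prot() accepts an explicit _PAGE_CACHE_* value rather than the fixed policy of the wrappers above. A hedged usage sketch; the physical address and size are placeholders:

	void __iomem *regs;

	/* equivalent in effect to ioremap_nocache(), spelled via ioremap_prot() */
	regs = ioremap_prot(0xfed00000, 0x1000, _PAGE_CACHE_UC_MINUS);
	if (!regs)
		return -ENOMEM;
	writel(0, regs);
	iounmap(regs);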
303/** 411/**
304 * iounmap - Free a IO remapping 412 * iounmap - Free a IO remapping
305 * @addr: virtual address from ioremap_* 413 * @addr: virtual address from ioremap_*
@@ -318,13 +426,15 @@ void iounmap(volatile void __iomem *addr)
318 * vm_area and by simply returning an address into the kernel mapping 426 * vm_area and by simply returning an address into the kernel mapping
319 * of ISA space. So handle that here. 427 * of ISA space. So handle that here.
320 */ 428 */
321 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 429 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
322 addr < phys_to_virt(ISA_END_ADDRESS)) 430 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
323 return; 431 return;
324 432
325 addr = (volatile void __iomem *) 433 addr = (volatile void __iomem *)
326 (PAGE_MASK & (unsigned long __force)addr); 434 (PAGE_MASK & (unsigned long __force)addr);
327 435
436 mmiotrace_iounmap(addr);
437
328 /* Use the vm area unlocked, assuming the caller 438 /* Use the vm area unlocked, assuming the caller
329 ensures there isn't another iounmap for the same address 439 ensures there isn't another iounmap for the same address
330 in parallel. Reuse of the virtual address is prevented by 440 in parallel. Reuse of the virtual address is prevented by
@@ -332,7 +442,7 @@ void iounmap(volatile void __iomem *addr)
332 cpa takes care of the direct mappings. */ 442 cpa takes care of the direct mappings. */
333 read_lock(&vmlist_lock); 443 read_lock(&vmlist_lock);
334 for (p = vmlist; p; p = p->next) { 444 for (p = vmlist; p; p = p->next) {
335 if (p->addr == addr) 445 if (p->addr == (void __force *)addr)
336 break; 446 break;
337 } 447 }
338 read_unlock(&vmlist_lock); 448 read_unlock(&vmlist_lock);
@@ -346,7 +456,7 @@ void iounmap(volatile void __iomem *addr)
346 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 456 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
347 457
348 /* Finally remove it */ 458 /* Finally remove it */
349 o = remove_vm_area((void *)addr); 459 o = remove_vm_area((void __force *)addr);
350 BUG_ON(p != o || o == NULL); 460 BUG_ON(p != o || o == NULL);
351 kfree(p); 461 kfree(p);
352} 462}
@@ -365,7 +475,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 475 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 476 return __va(phys);
367 477
368 addr = (void *)ioremap(start, PAGE_SIZE); 478 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
369 if (addr) 479 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 480 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 481
@@ -381,9 +491,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
381 return; 491 return;
382} 492}
383 493
384#ifdef CONFIG_X86_32 494static int __initdata early_ioremap_debug;
385
386int __initdata early_ioremap_debug;
387 495
388static int __init early_ioremap_debug_setup(char *str) 496static int __init early_ioremap_debug_setup(char *str)
389{ 497{
@@ -394,8 +502,7 @@ static int __init early_ioremap_debug_setup(char *str)
394early_param("early_ioremap_debug", early_ioremap_debug_setup); 502early_param("early_ioremap_debug", early_ioremap_debug_setup);
395 503
396static __initdata int after_paging_init; 504static __initdata int after_paging_init;
397static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 505static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
398 __section(.bss.page_aligned);
399 506
400static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 507static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
401{ 508{
@@ -484,20 +591,21 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
484 return; 591 return;
485 } 592 }
486 pte = early_ioremap_pte(addr); 593 pte = early_ioremap_pte(addr);
594
487 if (pgprot_val(flags)) 595 if (pgprot_val(flags))
488 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 596 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
489 else 597 else
490 pte_clear(NULL, addr, pte); 598 pte_clear(&init_mm, addr, pte);
491 __flush_tlb_one(addr); 599 __flush_tlb_one(addr);
492} 600}
493 601
494static inline void __init early_set_fixmap(enum fixed_addresses idx, 602static inline void __init early_set_fixmap(enum fixed_addresses idx,
495 unsigned long phys) 603 unsigned long phys, pgprot_t prot)
496{ 604{
497 if (after_paging_init) 605 if (after_paging_init)
498 set_fixmap(idx, phys); 606 __set_fixmap(idx, phys, prot);
499 else 607 else
500 __early_set_fixmap(idx, phys, PAGE_KERNEL); 608 __early_set_fixmap(idx, phys, prot);
501} 609}
502 610
503static inline void __init early_clear_fixmap(enum fixed_addresses idx) 611static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -508,37 +616,56 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
508 __early_set_fixmap(idx, 0, __pgprot(0)); 616 __early_set_fixmap(idx, 0, __pgprot(0));
509} 617}
510 618
511 619static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
512int __initdata early_ioremap_nested; 620static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
513
514static int __init check_early_ioremap_leak(void) 621static int __init check_early_ioremap_leak(void)
515{ 622{
516 if (!early_ioremap_nested) 623 int count = 0;
517 return 0; 624 int i;
518 625
519 printk(KERN_WARNING 626 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
627 if (prev_map[i])
628 count++;
629
630 if (!count)
631 return 0;
632 WARN(1, KERN_WARNING
520 "Debug warning: early ioremap leak of %d areas detected.\n", 633 "Debug warning: early ioremap leak of %d areas detected.\n",
521 early_ioremap_nested); 634 count);
522 printk(KERN_WARNING 635 printk(KERN_WARNING
523 "please boot with early_ioremap_debug and report the dmesg.\n"); 636 "please boot with early_ioremap_debug and report the dmesg.\n");
524 WARN_ON(1);
525 637
526 return 1; 638 return 1;
527} 639}
528late_initcall(check_early_ioremap_leak); 640late_initcall(check_early_ioremap_leak);
529 641
530void __init *early_ioremap(unsigned long phys_addr, unsigned long size) 642static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
531{ 643{
532 unsigned long offset, last_addr; 644 unsigned long offset, last_addr;
533 unsigned int nrpages, nesting; 645 unsigned int nrpages;
534 enum fixed_addresses idx0, idx; 646 enum fixed_addresses idx0, idx;
647 int i, slot;
535 648
536 WARN_ON(system_state != SYSTEM_BOOTING); 649 WARN_ON(system_state != SYSTEM_BOOTING);
537 650
538 nesting = early_ioremap_nested; 651 slot = -1;
652 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
653 if (!prev_map[i]) {
654 slot = i;
655 break;
656 }
657 }
658
659 if (slot < 0) {
 660 printk(KERN_INFO "early_ioremap(%08lx, %08lx): no free slot\n",
661 phys_addr, size);
662 WARN_ON(1);
663 return NULL;
664 }
665
539 if (early_ioremap_debug) { 666 if (early_ioremap_debug) {
540 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", 667 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
541 phys_addr, size, nesting); 668 phys_addr, size, slot);
542 dump_stack(); 669 dump_stack();
543 } 670 }
544 671
@@ -549,17 +676,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
549 return NULL; 676 return NULL;
550 } 677 }
551 678
552 if (nesting >= FIX_BTMAPS_NESTING) { 679 prev_size[slot] = size;
553 WARN_ON(1);
554 return NULL;
555 }
556 early_ioremap_nested++;
557 /* 680 /*
558 * Mappings have to be page-aligned 681 * Mappings have to be page-aligned
559 */ 682 */
560 offset = phys_addr & ~PAGE_MASK; 683 offset = phys_addr & ~PAGE_MASK;
561 phys_addr &= PAGE_MASK; 684 phys_addr &= PAGE_MASK;
562 size = PAGE_ALIGN(last_addr) - phys_addr; 685 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
563 686
564 /* 687 /*
565 * Mappings have to fit in the FIX_BTMAP area. 688 * Mappings have to fit in the FIX_BTMAP area.
@@ -573,10 +696,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
573 /* 696 /*
574 * Ok, go for it.. 697 * Ok, go for it..
575 */ 698 */
576 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 699 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
577 idx = idx0; 700 idx = idx0;
578 while (nrpages > 0) { 701 while (nrpages > 0) {
579 early_set_fixmap(idx, phys_addr); 702 early_set_fixmap(idx, phys_addr, prot);
580 phys_addr += PAGE_SIZE; 703 phys_addr += PAGE_SIZE;
581 --idx; 704 --idx;
582 --nrpages; 705 --nrpages;
@@ -584,7 +707,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
584 if (early_ioremap_debug) 707 if (early_ioremap_debug)
585 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 708 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
586 709
587 return (void *) (offset + fix_to_virt(idx0)); 710 prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
711 return prev_map[slot];
712}
713
714/* Remap an IO device */
715void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
716{
717 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
718}
719
720/* Remap memory */
721void __init *early_memremap(unsigned long phys_addr, unsigned long size)
722{
723 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
588} 724}
589 725
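early_memremap() is intended for plain RAM such as firmware tables, while early_ioremap() keeps the IO protections; both are undone with early_iounmap(). A hedged boot-time sketch; the table type and physical address are placeholders:

	struct fw_table *tbl;		/* hypothetical firmware table */

	tbl = early_memremap(table_phys, sizeof(*tbl));
	if (!tbl)
		return;
	/* read whatever fields are needed while the mapping exists */
	early_iounmap(tbl, sizeof(*tbl));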
590void __init early_iounmap(void *addr, unsigned long size) 726void __init early_iounmap(void *addr, unsigned long size)
@@ -593,15 +729,33 @@ void __init early_iounmap(void *addr, unsigned long size)
593 unsigned long offset; 729 unsigned long offset;
594 unsigned int nrpages; 730 unsigned int nrpages;
595 enum fixed_addresses idx; 731 enum fixed_addresses idx;
596 int nesting; 732 int i, slot;
597 733
598 nesting = --early_ioremap_nested; 734 slot = -1;
599 if (WARN_ON(nesting < 0)) 735 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
736 if (prev_map[i] == addr) {
737 slot = i;
738 break;
739 }
740 }
741
742 if (slot < 0) {
 743 printk(KERN_INFO "early_iounmap(%p, %08lx): slot not found\n",
744 addr, size);
745 WARN_ON(1);
746 return;
747 }
748
749 if (prev_size[slot] != size) {
750 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
751 addr, size, slot, prev_size[slot]);
752 WARN_ON(1);
600 return; 753 return;
754 }
601 755
602 if (early_ioremap_debug) { 756 if (early_ioremap_debug) {
603 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 757 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
604 size, nesting); 758 size, slot);
605 dump_stack(); 759 dump_stack();
606 } 760 }
607 761
@@ -613,17 +767,16 @@ void __init early_iounmap(void *addr, unsigned long size)
613 offset = virt_addr & ~PAGE_MASK; 767 offset = virt_addr & ~PAGE_MASK;
614 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 768 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
615 769
616 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 770 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
617 while (nrpages > 0) { 771 while (nrpages > 0) {
618 early_clear_fixmap(idx); 772 early_clear_fixmap(idx);
619 --idx; 773 --idx;
620 --nrpages; 774 --nrpages;
621 } 775 }
 776 prev_map[slot] = NULL;
622} 777}
623 778
624void __this_fixmap_does_not_exist(void) 779void __this_fixmap_does_not_exist(void)
625{ 780{
626 WARN_ON(1); 781 WARN_ON(1);
627} 782}
628
629#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
 2 * Borrows much code from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
 75 * This is basically a dynamic stabbing problem:
 76 * we could use the existing prio tree code, or one of these
 77 * possibly better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
 164 * We may be in an interrupt or a critical section. Also prefetching may
165 * trigger a page fault. We may be in the middle of process switch.
166 * We cannot take any locks, because we could be executing especially
167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry as trap3 is an interrupt gate
 174 * and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
 269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
 443 * Actually free the kmmio_fault_page structs from the RCU callback.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
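Tying the rules above together, a hedged sketch of a client tearing down a probe; the probe object and its callbacks are illustrative (mmio-mod.c below is the in-tree user):

static struct kmmio_probe my_probe;	/* .addr/.len/.pre_handler/.post_handler set at register time */

static void stop_tracing_example(void)
{
	unregister_kmmio_probe(&my_probe);
	/* handlers may still run until a grace period has elapsed */
	synchronize_rcu();
	/* only now may the kmmio_probe memory be freed or reused */
}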
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
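With memtest=N on the kernel command line, N patterns are written and verified over every free e820 RAM range. Boot code is expected to call early_memtest() once the e820 map is final; a hedged sketch of such a call site, with an illustrative range:

	/* scrub all low memory with the patterns requested via memtest= */
	early_memtest(0, max_low_pfn << PAGE_SHIFT);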
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..635b50e85581
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,517 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
 59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
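As a hedged illustration of these rules, a routine that must check is_enabled() itself (unlike the pre/post callbacks) would take trace_lock around both the check and the record; the function name is illustrative:

static void record_rw_example(struct mmiotrace_rw *rw)
{
	spin_lock_irq(&trace_lock);
	if (is_enabled())	/* recheck under trace_lock, per the rules above */
		mmio_trace_rw(rw);
	spin_unlock_irq(&trace_lock);
}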
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
 258 /* this should always bring the active_traces count back to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
434 but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
436{
437 int cpu;
438 int err;
439
440 if (cpus_weight(downed_cpus) == 0)
441 return;
442 pr_notice(NAME "Re-enabling CPUs...\n");
443 for_each_cpu_mask(cpu, downed_cpus) {
444 err = cpu_up(cpu);
445 if (!err)
446 pr_info(NAME "enabled CPU%d.\n", cpu);
447 else
448 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
449 }
450}
451
452#else /* !CONFIG_HOTPLUG_CPU */
453static void enter_uniprocessor(void)
454{
455 if (num_online_cpus() > 1)
456 pr_warning(NAME "multiple CPUs are online, may miss events. "
457 "Suggest booting with maxcpus=1 kernel argument.\n");
458}
459
460static void leave_uniprocessor(void)
461{
462}
463#endif
464
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void)
473{
474 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled())
476 goto out;
477
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor();
488 spin_lock_irq(&trace_lock);
489 atomic_inc(&mmiotrace_enabled);
490 spin_unlock_irq(&trace_lock);
491 pr_info(NAME "enabled.\n");
492out:
493 mutex_unlock(&mmiotrace_mutex);
494}
495
496void disable_mmiotrace(void)
497{
498 mutex_lock(&mmiotrace_mutex);
499 if (!is_enabled())
500 goto out;
501
502 spin_lock_irq(&trace_lock);
503 atomic_dec(&mmiotrace_enabled);
504 BUG_ON(is_enabled());
505 spin_unlock_irq(&trace_lock);
506
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n");
515out:
516 mutex_unlock(&mmiotrace_mutex);
517}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 914ccf983687..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -38,10 +38,10 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
44static bootmem_data_t node0_bdata;
45 45
46/* 46/*
47 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 59/*
60 * 4) physnode_map - the mapping between a pfn and owning node 60 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 61 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 62 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 63 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 64 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 65 * physnode_map will contain:
66 * 66 *
67 * physnode_map[0-3] = 0; 67 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 68 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 69 * physnode_map[32- ] = -1;
70 */ 70 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 72EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 75{
76 unsigned long pfn; 76 unsigned long pfn;
77 77
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 78 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 79 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 81 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 84 printk(KERN_CONT "%lx ", pfn);
85 } 85 }
86 printk("\n"); 86 printk(KERN_CONT "\n");
87} 87}
88 88
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 99#endif
100 100
101extern unsigned long find_max_low_pfn(void); 101extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 102extern unsigned long highend_pfn, highstart_pfn;
104 103
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
117 */ 116 */
118int __init get_memcfg_numa_flat(void) 117int __init get_memcfg_numa_flat(void)
119{ 118{
120 printk("NUMA - single node, flat memory mode\n"); 119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 120
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 121 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 126
128 /* Indicate there is one node available. */ 127 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 128 nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 155 */
157static void __init allocate_pgdat(int nid) 156static void __init allocate_pgdat(int nid)
158{ 157{
159 if (nid && node_has_online_mem(nid)) 158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 162 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 163 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
165} 175}
166 176
167#ifdef CONFIG_DISCONTIGMEM
168/* 177/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 179 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 180 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 181 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 182 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 183 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 184 */
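A hedged sketch of the allocation pattern the comment describes: try the node-local remapped KVA first and fall back to bootmem; the caller and size are illustrative:

	struct page *map;

	map = alloc_remap(nid, size);
	if (!map)
		map = alloc_bootmem_node(NODE_DATA(nid), size);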
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 185static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 186static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 202 return allocation;
196} 203}
197 204
198void __init remap_numa_kva(void) 205static void __init remap_numa_kva(void)
199{ 206{
200 void *vaddr; 207 void *vaddr;
201 unsigned long pfn; 208 unsigned long pfn;
202 int node; 209 int node;
203 210
204 for_each_online_node(node) { 211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 218 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 219 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 220 PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 226{
216 int nid; 227 int nid;
217 unsigned long size, reserve_pages = 0; 228 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 229
220 for_each_online_node(nid) { 230 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 231 u64 node_kva_target;
232 u64 node_kva_final;
222 233
223 /* 234 /*
224 * The acpi/srat node info can show hot-add memory zones 235
225 * where memory could be added but not currently present. 236 * where memory could be added but not currently present.
226 */ 237 */
238 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
239 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 240 if (node_start_pfn[nid] > max_pfn)
228 continue; 241 continue;
242 if (!node_end_pfn[nid])
243 continue;
229 if (node_end_pfn[nid] > max_pfn) 244 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 245 node_end_pfn[nid] = max_pfn;
231 246
@@ -237,41 +252,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 252 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 253 size = size * PTRS_PER_PTE;
239 254
240 /* 255 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 256 PTRS_PER_PTE);
242 * pages. 257 node_kva_target <<= PAGE_SHIFT;
243 */ 258 do {
244 for (pfn = node_end_pfn[nid] - size; 259 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 260 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 261 ((u64)size)<<PAGE_SHIFT,
247 break; 262 LARGE_PAGE_BYTES);
248 263 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 264 } while (node_kva_final == -1ULL &&
250 size = 0; 265 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
266
267 if (node_kva_final == -1ULL)
268 panic("Can not get kva ram\n");
251 269
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 270 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 271 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 272 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 273 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 274 " node %d at %llx\n",
259 275 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 276
261 /* 277 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 278 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 279 * with less memory later.
264 */ 280 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 281 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 282 * we are supposed to only record the one less than max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 283 * but we could have some hole in high memory, and it will only
268 } 284 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
285 * to use it as free.
286 * So reserve_early here, hope we don't run out of that array
287 */
288 reserve_early(node_kva_final,
289 node_kva_final+(((u64)size)<<PAGE_SHIFT),
290 "KVA RAM");
269 291
270 node_end_pfn[nid] -= size; 292 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 293 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 294 node_remap_start_pfn[nid] + size);
273 } 295 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 296 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 297 reserve_pages);
276 return reserve_pages; 298 return reserve_pages;
277} 299}
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 307 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 308 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 309
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 310 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 311 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 312 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 313}
311#endif /* CONFIG_DISCONTIGMEM */
312 314
313extern void setup_bootmem_allocator(void); 315void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 316 unsigned long end_pfn)
315{ 317{
316 int nid; 318 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 319 long kva_target_pfn;
318 unsigned long wasted_pages;
319 320
320 /* 321 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 322 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 325 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 326 * and ZONE_HIGHMEM.
326 */ 327 */
327 get_memcfg_numa();
328 328
329 kva_pages = calculate_numa_remap_pages(); 329 get_memcfg_numa();
330 330
331 /* partially used pages are not usable - thus round upwards */ 331 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 332
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do {
335 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
336 max_low_pfn<<PAGE_SHIFT,
337 kva_pages<<PAGE_SHIFT,
338 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
339 kva_target_pfn -= PTRS_PER_PTE;
340 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 341
336#ifdef CONFIG_BLK_DEV_INITRD 342 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 343 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 344
343 /* 345 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 346 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 347 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
348
349 /* avoid clash with initrd */
350 reserve_early(kva_start_pfn<<PAGE_SHIFT,
351 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
352 "KVA PG");
355#ifdef CONFIG_HIGHMEM 353#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 354 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 355 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 356 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 357 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 358 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 359 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 360 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 361#else
364 num_physpages = system_max_low_pfn; 362 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 363 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 364#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 365 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 366 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 367 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 368 max_low_pfn, highstart_pfn);
371 369
372 printk("Low memory ends at vaddr %08lx\n", 370 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 371 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 372 for_each_online_node(nid) {
375 init_remap_allocator(nid); 373 init_remap_allocator(nid);
376 374
377 allocate_pgdat(nid); 375 allocate_pgdat(nid);
378 } 376 }
379 printk("High memory starts at vaddr %08lx\n", 377 remap_numa_kva();
378
379 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 380 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 381 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 382 propagate_e820_map_node(nid);
383 383
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 384 for_each_online_node(nid)
385 NODE_DATA(0)->bdata = &node0_bdata; 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
386 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395}
396
397void __init zone_sizes_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417 386
418 free_area_init_nodes(max_zone_pfns); 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
419 return; 388 setup_bootmem_allocator();
420} 389}
421 390
422void __init set_highmem_pages_init(int bad_ppro) 391void __init set_highmem_pages_init(void)
423{ 392{
424#ifdef CONFIG_HIGHMEM 393#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 394 struct zone *zone;
426 struct page *page; 395 int nid;
427 396
428 for_each_zone(zone) { 397 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 398 unsigned long zone_start_pfn, zone_end_pfn;
430 399
431 if (!is_highmem(zone)) 400 if (!is_highmem(zone))
432 continue; 401 continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 403 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 404 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 405
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 406 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 407 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 408 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 409
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 410 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 411 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 412 }
448 totalram_pages += totalhigh_pages; 413 totalram_pages += totalhigh_pages;
449#endif 414#endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 441
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 442EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 443#endif
444
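The discontig_32.c changes above rely on one recurring placement idiom: ask find_e820_area() for an aligned block near a target address and walk the target downward until the e820 map yields usable RAM (see calculate_numa_remap_pages() and initmem_init()). A minimal sketch of that search loop follows, assuming only the find_e820_area() interface shown in the diff; the helper name find_ram_below() is illustrative and not part of the patch:

	/*
	 * Illustrative only: scan downward from 'target' for 'size' bytes
	 * of e820 RAM below 'limit', aligned to 'align'.  find_e820_area()
	 * returns -1ULL when nothing fits in the requested window.
	 */
	static u64 __init find_ram_below(u64 target, u64 limit, u64 size,
					 u64 align, u64 floor)
	{
		u64 found;

		do {
			found = find_e820_area(target, limit, size, align);
			target -= align;	/* slide the search window down */
		} while (found == -1ULL && target > floor);

		return found;		/* still -1ULL on failure */
	}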
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,37 +20,18 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 30};
47 31
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 32int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
54 35
55/* 36/*
56 * Given a shift value, try to populate memnodemap[] 37 * Given a shift value, try to populate memnodemap[]
@@ -98,8 +79,8 @@ static int __init allocate_cachealigned_memnodemap(void)
98 return 0; 79 return 0;
99 80
100 addr = 0x8000; 81 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 86 printk(KERN_ERR
@@ -192,19 +173,19 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 173void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 174 unsigned long end)
194{ 175{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 178 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
199 int nid; 180 int nid;
200 181
201 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
202 183
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 185 start, end);
205 186
206 start_pfn = start >> PAGE_SHIFT; 187 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 188 last_pfn = end >> PAGE_SHIFT;
208 189
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 190 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 191 SMP_CACHE_BYTES);
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
215 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
216 197
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 202
222 /* 203 /*
223 * Find a place for the bootmem map 204 * Find a place for the bootmem map
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 207 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 208 * of alloc_bootmem, that could clash with reserved range
228 */ 209 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 212 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 214 else
234 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 216 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
238 */ 219 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 229
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 231 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 232 start_pfn, last_pfn);
252 233
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 234 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 235 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +290,7 @@ void __init numa_init_array(void)
309 290
310#ifdef CONFIG_NUMA_EMU 291#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 292/* Numa emulation */
312char *cmdline __initdata; 293static char *cmdline __initdata;
313 294
314/* 295/*
315 * Sets up nid to range from addr to addr + size. If the end 296
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 394}
414 395
415/* 396/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 397 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 398 * numa=fake command-line option.
418 */ 399 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 400static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 401
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 402static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 403{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 404 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 405 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 406 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 407
427 memset(&nodes, 0, sizeof(nodes)); 408 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +508,7 @@ out:
527} 508}
528#endif /* CONFIG_NUMA_EMU */ 509#endif /* CONFIG_NUMA_EMU */
529 510
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 511void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 512{
532 int i; 513 int i;
533 514
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 516 nodes_clear(node_online_map);
536 517
537#ifdef CONFIG_NUMA_EMU 518#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 519 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 520 return;
540 nodes_clear(node_possible_map); 521 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 524
544#ifdef CONFIG_ACPI_NUMA 525#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 526 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 527 last_pfn << PAGE_SHIFT))
547 return; 528 return;
548 nodes_clear(node_possible_map); 529 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 530 nodes_clear(node_online_map);
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 532
552#ifdef CONFIG_K8_NUMA 533#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 534 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 535 last_pfn<<PAGE_SHIFT))
555 return; 536 return;
556 nodes_clear(node_possible_map); 537 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 538 nodes_clear(node_online_map);
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 542
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 544 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 545 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 546 /* setup dummy node covering all memory */
566 memnode_shift = 63; 547 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 548 memnodemap = memnode.embedded_map;
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 553 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 554 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 556}
597 557
598unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +573,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 577
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 579 sparse_init();
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt)
641} 601}
642early_param("numa", numa_setup); 602early_param("numa", numa_setup);
643 603
604#ifdef CONFIG_NUMA
644/* 605/*
645 * Setup early cpu_to_node. 606 * Setup early cpu_to_node.
646 * 607 *
@@ -652,14 +613,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 613 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 614 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 615 * for the fake NUMA cases.
616 *
617 * Called before the per_cpu areas are set up.
655 */ 618 */
656void __init init_cpu_to_node(void) 619void __init init_cpu_to_node(void)
657{ 620{
658 int i; 621 int cpu;
622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 623
660 for (i = 0; i < NR_CPUS; i++) { 624 BUG_ON(cpu_to_apicid == NULL);
625
626 for_each_possible_cpu(cpu) {
661 int node; 627 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 628 u16 apicid = cpu_to_apicid[cpu];
663 629
664 if (apicid == BAD_APICID) 630 if (apicid == BAD_APICID)
665 continue; 631 continue;
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void)
668 continue; 634 continue;
669 if (!node_online(node)) 635 if (!node_online(node))
670 continue; 636 continue;
671 numa_set_node(i, node); 637 numa_set_node(cpu, node);
672 } 638 }
673} 639}
640#endif
674 641
675 642
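The setup_node_bootmem() hunks in the numa_64.c diff above are spread over several chunks; condensed, the per-node flow after the end_pfn -> last_pfn rename looks roughly like the sketch below (simplified: error handling, the debug printks, and the cross-node bootmap placement fallback are omitted, so this is an illustration rather than a drop-in replacement):

	void __init setup_node_bootmem(int nodeid, unsigned long start,
				       unsigned long end)
	{
		unsigned long start_pfn = start >> PAGE_SHIFT;
		unsigned long last_pfn = end >> PAGE_SHIFT;
		const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
		unsigned long bootmap_start, bootmap_pages;
		void *bootmap;

		/* place the node's pg_data_t inside the node via the e820 map */
		node_data[nodeid] = early_node_mem(nodeid, start, end,
						   pgdat_size, SMP_CACHE_BYTES);
		memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
		NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
		NODE_DATA(nodeid)->node_start_pfn = start_pfn;
		NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

		/* put the bootmem bitmap right after pg_data_t when it fits */
		bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
		bootmap_start = roundup(__pa(NODE_DATA(nodeid)) + pgdat_size,
					PAGE_SIZE);
		bootmap = early_node_mem(nodeid, bootmap_start, end,
					 bootmap_pages << PAGE_SHIFT, PAGE_SIZE);

		/* hand the node's pfn range to the bootmem allocator */
		init_bootmem_node(NODE_DATA(nodeid), __pa(bootmap) >> PAGE_SHIFT,
				  start_pfn, last_pfn);
	}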
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -111,6 +118,7 @@ static int pageattr_test(void)
111 unsigned int level; 118 unsigned int level;
112 int i, k; 119 int i, k;
113 int err; 120 int err;
121 unsigned long test_addr;
114 122
115 if (print) 123 if (print)
116 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -165,15 +173,15 @@ static int pageattr_test(void)
165 continue; 173 continue;
166 } 174 }
167 175
168 err = change_page_attr_clear(addr[i], len[i], 176 test_addr = addr[i];
169 __pgprot(_PAGE_GLOBAL)); 177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
170 if (err < 0) { 178 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 180 failed++;
173 } 181 }
174 182
175 pte = lookup_address(addr[i], &level); 183 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 184 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 185 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 186 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 187 failed++;
@@ -198,14 +206,14 @@ static int pageattr_test(void)
198 failed++; 206 failed++;
199 continue; 207 continue;
200 } 208 }
201 err = change_page_attr_set(addr[i], len[i], 209 test_addr = addr[i];
202 __pgprot(_PAGE_GLOBAL)); 210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
203 if (err < 0) { 211 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 213 failed++;
206 } 214 }
207 pte = lookup_address(addr[i], &level); 215 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 216 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 217 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 218 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 219 failed++;
@@ -216,8 +224,7 @@ static int pageattr_test(void)
216 failed += print_split(&sc); 224 failed += print_split(&sc);
217 225
218 if (failed) { 226 if (failed) {
219 printk(KERN_ERR "NOT PASSED. Please report.\n"); 227 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
220 WARN_ON(1);
221 return -EINVAL; 228 return -EINVAL;
222 } else { 229 } else {
223 if (print) 230 if (print)
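The pageattr-test.c hunks above switch the self-test from toggling _PAGE_GLOBAL to toggling a spare software bit, _PAGE_CPA_TEST (read back through _PAGE_UNUSED1 by pte_testbit() in the hunk above). Condensed from those hunks, one test iteration essentially does the following; the wrapper name cpa_test_one_range() is only for illustration:

	static int cpa_test_one_range(unsigned long addr, int numpages)
	{
		unsigned int level;
		pte_t *pte;
		unsigned long test_addr = addr;

		/* set the test bit, then verify it through the page tables */
		if (change_page_attr_set(&test_addr, numpages, PAGE_CPA_TEST, 0) < 0)
			return -EINVAL;
		pte = lookup_address(addr, &level);
		if (!pte || !pte_testbit(*pte) || pte_huge(*pte))
			return -EINVAL;

		/* revert and verify the bit is gone again */
		test_addr = addr;
		if (change_page_attr_clear(&test_addr, numpages, PAGE_CPA_TEST, 0) < 0)
			return -EINVAL;
		pte = lookup_address(addr, &level);
		if (!pte || pte_testbit(*pte))
			return -EINVAL;

		return 0;
	}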
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,68 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock, so that no cpu with stale large-page tlb entries can
41 * change a page attribute in parallel with another cpu that is splitting
42 * a large page entry while changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
49#ifdef CONFIG_PROC_FS
50static unsigned long direct_pages_count[PG_LEVEL_NUM];
51
52void update_page_count(int level, unsigned long pages)
53{
54 unsigned long flags;
55
56 /* Protect against CPA */
57 spin_lock_irqsave(&pgd_lock, flags);
58 direct_pages_count[level] += pages;
59 spin_unlock_irqrestore(&pgd_lock, flags);
60}
61
62static void split_page_count(int level)
63{
64 direct_pages_count[level]--;
65 direct_pages_count[level - 1] += PTRS_PER_PTE;
66}
67
68int arch_report_meminfo(char *page)
69{
70 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
71 direct_pages_count[PG_LEVEL_4K] << 2);
72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
73 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
74 direct_pages_count[PG_LEVEL_2M] << 11);
75#else
76 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
77 direct_pages_count[PG_LEVEL_2M] << 12);
78#endif
79#ifdef CONFIG_X86_64
80 if (direct_gbpages)
81 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_1G] << 20);
83#endif
84 return n;
85}
86#else
87static inline void split_page_count(int level) { }
88#endif
89
37#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
38 91
39static inline unsigned long highmap_start_pfn(void) 92static inline unsigned long highmap_start_pfn(void)
@@ -43,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
43 96
44static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
45{ 98{
46 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
47} 100}
48 101
49#endif 102#endif
@@ -106,7 +159,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 159{
107 BUG_ON(irqs_disabled()); 160 BUG_ON(irqs_disabled());
108 161
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 162 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 163}
111 164
112static void __cpa_flush_range(void *arg) 165static void __cpa_flush_range(void *arg)
@@ -127,7 +180,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 180 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 181 WARN_ON(PAGE_ALIGN(start) != start);
129 182
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 183 on_each_cpu(__cpa_flush_range, NULL, 1);
131 184
132 if (!cache) 185 if (!cache)
133 return; 186 return;
@@ -149,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
149 } 202 }
150} 203}
151 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
152/* 240/*
153 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
154 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
@@ -227,6 +315,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 315
228 return pte_offset_kernel(pmd, address); 316 return pte_offset_kernel(pmd, address);
229} 317}
318EXPORT_SYMBOL_GPL(lookup_address);
230 319
231/* 320/*
232 * Set the new pmd in all the pgds we know about: 321 * Set the new pmd in all the pgds we know about:
@@ -356,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
356 */ 445 */
357 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
358 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
359 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
360 do_split = 0; 449 do_split = 0;
361 } 450 }
362 451
@@ -366,84 +455,6 @@ out_unlock:
366 return do_split; 455 return do_split;
367} 456}
368 457
369static LIST_HEAD(page_pool);
370static unsigned long pool_size, pool_pages, pool_low;
371static unsigned long pool_used, pool_failed;
372
373static void cpa_fill_pool(struct page **ret)
374{
375 gfp_t gfp = GFP_KERNEL;
376 unsigned long flags;
377 struct page *p;
378
379 /*
380 * Avoid recursion (on debug-pagealloc) and also signal
381 * our priority to get to these pagetables:
382 */
383 if (current->flags & PF_MEMALLOC)
384 return;
385 current->flags |= PF_MEMALLOC;
386
387 /*
388 * Allocate atomically from atomic contexts:
389 */
390 if (in_atomic() || irqs_disabled() || debug_pagealloc)
391 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
392
393 while (pool_pages < pool_size || (ret && !*ret)) {
394 p = alloc_pages(gfp, 0);
395 if (!p) {
396 pool_failed++;
397 break;
398 }
399 /*
400 * If the call site needs a page right now, provide it:
401 */
402 if (ret && !*ret) {
403 *ret = p;
404 continue;
405 }
406 spin_lock_irqsave(&pgd_lock, flags);
407 list_add(&p->lru, &page_pool);
408 pool_pages++;
409 spin_unlock_irqrestore(&pgd_lock, flags);
410 }
411
412 current->flags &= ~PF_MEMALLOC;
413}
414
415#define SHIFT_MB (20 - PAGE_SHIFT)
416#define ROUND_MB_GB ((1 << 10) - 1)
417#define SHIFT_MB_GB 10
418#define POOL_PAGES_PER_GB 16
419
420void __init cpa_init(void)
421{
422 struct sysinfo si;
423 unsigned long gb;
424
425 si_meminfo(&si);
426 /*
427 * Calculate the number of pool pages:
428 *
429 * Convert totalram (nr of pages) to MiB and round to the next
430 * GiB. Shift MiB to Gib and multiply the result by
431 * POOL_PAGES_PER_GB:
432 */
433 if (debug_pagealloc) {
434 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
435 pool_size = POOL_PAGES_PER_GB * gb;
436 } else {
437 pool_size = 1;
438 }
439 pool_low = pool_size;
440
441 cpa_fill_pool(NULL);
442 printk(KERN_DEBUG
443 "CPA: page pool initialized %lu of %lu pages preallocated\n",
444 pool_pages, pool_size);
445}
446
447static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
448{ 459{
449 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -452,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
452 pgprot_t ref_prot; 463 pgprot_t ref_prot;
453 struct page *base; 464 struct page *base;
454 465
455 /* 466 if (!debug_pagealloc)
456 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
457 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
458 * operation: 469 if (!debug_pagealloc)
459 */ 470 spin_lock(&cpa_lock);
460 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
461 if (list_empty(&page_pool)) { 472 return -ENOMEM;
462 spin_unlock_irqrestore(&pgd_lock, flags);
463 base = NULL;
464 cpa_fill_pool(&base);
465 if (!base)
466 return -ENOMEM;
467 spin_lock_irqsave(&pgd_lock, flags);
468 } else {
469 base = list_first_entry(&page_pool, struct page, lru);
470 list_del(&base->lru);
471 pool_pages--;
472
473 if (pool_pages < pool_low)
474 pool_low = pool_pages;
475 }
476 473
474 spin_lock_irqsave(&pgd_lock, flags);
477 /* 475 /*
478 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
479 * up for us already: 477 * up for us already:
@@ -500,6 +498,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 498 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 499 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 500
501 if (address >= (unsigned long)__va(0) &&
502 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
503 split_page_count(level);
504
505#ifdef CONFIG_X86_64
506 if (address >= (unsigned long)__va(1UL<<32) &&
507 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
508 split_page_count(level);
509#endif
510
503 /* 511 /*
504 * Install the new, split up pagetable. Important details here: 512 * Install the new, split up pagetable. Important details here:
505 * 513 *
@@ -520,11 +528,8 @@ out_unlock:
520 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
521 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
522 */ 530 */
523 if (base) { 531 if (base)
524 list_add(&base->lru, &page_pool); 532 __free_page(base);
525 pool_pages++;
526 } else
527 pool_used++;
528 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
529 534
530 return 0; 535 return 0;
@@ -532,11 +537,16 @@ out_unlock:
532 537
533static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
534{ 539{
535 unsigned long address = cpa->vaddr; 540 unsigned long address;
536 int do_split, err; 541 int do_split, err;
537 unsigned int level; 542 unsigned int level;
538 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
539 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
540repeat: 550repeat:
541 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
542 if (!kpte) 552 if (!kpte)
@@ -546,10 +556,9 @@ repeat:
546 if (!pte_val(old_pte)) { 556 if (!pte_val(old_pte)) {
547 if (!primary) 557 if (!primary)
548 return 0; 558 return 0;
549 printk(KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
550 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
551 cpa->vaddr); 561 *cpa->vaddr);
552 WARN_ON(1);
553 return -EINVAL; 562 return -EINVAL;
554 } 563 }
555 564
@@ -575,7 +584,7 @@ repeat:
575 */ 584 */
576 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
577 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
578 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
579 } 588 }
580 cpa->numpages = 1; 589 cpa->numpages = 1;
581 return 0; 590 return 0;
@@ -599,7 +608,25 @@ repeat:
599 */ 608 */
600 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
601 if (!err) { 610 if (!err) {
602 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * Without this, we violate the TLB application note, which says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu, with stale tlb entries, to change the
626 * page attribute in parallel on an address that also falls into the
627 * just-split large page entry.
628 */
629 flush_tlb_all();
603 goto repeat; 630 goto repeat;
604 } 631 }
605 632
@@ -612,19 +639,37 @@ static int cpa_process_alias(struct cpa_data *cpa)
612{ 639{
613 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
614 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
615 643
616 if (cpa->pfn > max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 645 return 0;
618 646
647#ifdef CONFIG_X86_64
648 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
649 return 0;
650#endif
619 /* 651 /*
620 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
621 * mapping already: 653 * mapping already:
622 */ 654 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
662#ifdef CONFIG_X86_64
663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
665#endif
666 )) {
625 667
626 alias_cpa = *cpa; 668 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
628 673
629 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
630 } 675 }
@@ -636,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
636 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
637 * mapping already: 682 * mapping already:
638 */ 683 */
639 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
640 return 0; 685 return 0;
641 686
642 /* 687 /*
@@ -647,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
647 return 0; 692 return 0;
648 693
649 alias_cpa = *cpa; 694 alias_cpa = *cpa;
650 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
651 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
652 698
653 /* 699 /*
654 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -668,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
668 * preservation check. 714 * preservation check.
669 */ 715 */
670 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large page */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
671 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
672 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
673 if (ret) 726 if (ret)
674 return ret; 727 return ret;
675 728
@@ -686,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
686 */ 739 */
687 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
688 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
689 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
690 } 747 }
691 return 0; 748 return 0;
692} 749}
@@ -697,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
697 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
698} 755}
699 756
700static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
701 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
702 int force_split) 759 int force_split, int array)
703{ 760{
704 struct cpa_data cpa; 761 struct cpa_data cpa;
705 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -714,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
714 return 0; 771 return 0;
715 772
716 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
717 if (addr & ~PAGE_MASK) { 774 if (!array) {
718 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
719 /* 776 *addr &= PAGE_MASK;
720 * People should not be passing in unaligned addresses: 777 /*
721 */ 778 * People should not be passing in unaligned addresses:
722 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
723 } 790 }
724 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
725 cpa.vaddr = addr; 795 cpa.vaddr = addr;
726 cpa.numpages = numpages; 796 cpa.numpages = numpages;
727 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
728 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
729 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
730 cpa.force_split = force_split; 801 cpa.force_split = force_split;
731 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
732 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
733 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
734 808
@@ -737,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
737 /* 811 /*
738 * Check whether we really changed something: 812 * Check whether we really changed something:
739 */ 813 */
740 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
741 goto out; 815 goto out;
742 816
743 /* 817 /*
@@ -752,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
752 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
753 * wbindv): 827 * wbindv):
754 */ 828 */
755 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
756 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
757 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
758 cpa_flush_all(cache); 835 cpa_flush_all(cache);
759 836
760out: 837out:
761 cpa_fill_pool(NULL);
762
763 return ret; 838 return ret;
764} 839}
765 840
766static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
767 pgprot_t mask) 842 pgprot_t mask, int array)
768{ 843{
769 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
770} 846}
771 847
772static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
773 pgprot_t mask) 849 pgprot_t mask, int array)
774{ 850{
775 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
776} 853}
777 854
778int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -780,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
780 /* 857 /*
781 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
782 */ 859 */
783 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
784 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
785} 862}
786 863
787int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -789,7 +866,7 @@ int set_memory_uc(unsigned long addr, int numpages)
789 /* 866 /*
790 * for now UC MINUS. see comments in ioremap_nocache() 867 * for now UC MINUS. see comments in ioremap_nocache()
791 */ 868 */
792 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 869 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
793 _PAGE_CACHE_UC_MINUS, NULL)) 870 _PAGE_CACHE_UC_MINUS, NULL))
794 return -EINVAL; 871 return -EINVAL;
795 872
@@ -797,18 +874,56 @@ int set_memory_uc(unsigned long addr, int numpages)
797} 874}
798EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
799 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
800int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
801{ 916{
802 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
803 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
804} 919}
805 920
806int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
807{ 922{
808 if (!pat_wc_enabled) 923 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 924 return set_memory_uc(addr, numpages);
810 925
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 926 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
812 _PAGE_CACHE_WC, NULL)) 927 _PAGE_CACHE_WC, NULL))
813 return -EINVAL; 928 return -EINVAL;
814 929
@@ -818,49 +933,71 @@ EXPORT_SYMBOL(set_memory_wc);
818 933
819int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
820{ 935{
821 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
822 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
823} 938}
824 939
825int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
826{ 941{
827 free_memtype(addr, addr + numpages * PAGE_SIZE); 942 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
828 943
829 return _set_memory_wb(addr, numpages); 944 return _set_memory_wb(addr, numpages);
830} 945}
831EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
832 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
833int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
834{ 969{
835 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
836} 971}
837EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
838 973
839int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
840{ 975{
841 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
842} 977}
843EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
844 979
845int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
846{ 981{
847 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
848} 983}
984EXPORT_SYMBOL_GPL(set_memory_ro);
849 985
850int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
851{ 987{
852 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
853} 989}
990EXPORT_SYMBOL_GPL(set_memory_rw);
854 991
855int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
856{ 993{
857 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
858} 995}
859 996
860int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
861{ 998{
862 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
863 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
864} 1001}
865 1002
866int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -913,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
913 1050
914static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
915{ 1052{
916 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
917 .numpages = numpages, 1055 .numpages = numpages,
918 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
919 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
920 1059
921 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
 1061 * No alias checking needed for setting present flag. Otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
922} 1067}
923 1068
924static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
925{ 1070{
926 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
927 .numpages = numpages, 1073 .numpages = numpages,
928 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
929 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
930 1077
931 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
 1080 * No alias checking needed for setting not present flag. Otherwise,
1080 * we may need to break large pages for 64-bit kernel text
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
932} 1085}
933 1086
934void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -948,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
948 1101
949 /* 1102 /*
950 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
951 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
952 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
953 * fails here (due to temporary memory shortage) no damage
954 * is done because we just keep the largepage intact up
955 * to the next attempt when it will likely be split up:
956 */ 1106 */
957 if (enable) 1107 if (enable)
958 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -964,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
964 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
965 */ 1115 */
966 __flush_tlb_all(); 1116 __flush_tlb_all();
967
968 /*
969 * Try to refill the page pool here. We can do this only after
970 * the tlb flush.
971 */
972 cpa_fill_pool(NULL);
973}
974
975#ifdef CONFIG_DEBUG_FS
976static int dpa_show(struct seq_file *m, void *v)
977{
978 seq_puts(m, "DEBUG_PAGEALLOC\n");
979 seq_printf(m, "pool_size : %lu\n", pool_size);
980 seq_printf(m, "pool_pages : %lu\n", pool_pages);
981 seq_printf(m, "pool_low : %lu\n", pool_low);
982 seq_printf(m, "pool_used : %lu\n", pool_used);
983 seq_printf(m, "pool_failed : %lu\n", pool_failed);
984
985 return 0;
986}
987
988static int dpa_open(struct inode *inode, struct file *filp)
989{
990 return single_open(filp, dpa_show, NULL);
991} 1117}
992 1118
993static const struct file_operations dpa_fops = {
994 .open = dpa_open,
995 .read = seq_read,
996 .llseek = seq_lseek,
997 .release = single_release,
998};
999
1000static int __init debug_pagealloc_proc_init(void)
1001{
1002 struct dentry *de;
1003
1004 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1005 &dpa_fops);
1006 if (!de)
1007 return -ENOMEM;
1008
1009 return 0;
1010}
1011__initcall(debug_pagealloc_proc_init);
1012#endif
1013
1014#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1015 1120
1016bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,30 +7,32 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15 17
16#include <asm/msr.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/page.h> 20#include <asm/tlbflush.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/pat.h>
22#include <asm/e820.h>
23#include <asm/cacheflush.h>
24#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
25#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 31int __read_mostly pat_enabled = 1;
30 32
31void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(char *reason)
32{ 34{
33 pat_wc_enabled = 0; 35 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
35} 37}
36 38
@@ -42,6 +44,20 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 44early_param("nopat", nopat);
43#endif 45#endif
44 46
47
48static int debug_enable;
49
50static int __init pat_debug_setup(char *str)
51{
52 debug_enable = 1;
53 return 0;
54}
55__setup("debugpat", pat_debug_setup);
56
57#define dprintk(fmt, arg...) \
58 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
59
60
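For example, booting with debugpat on the kernel command line sets debug_enable, so every dprintk() in this file is emitted at KERN_INFO; without the option the same calls print nothing.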
45static u64 __read_mostly boot_pat_state; 61static u64 __read_mostly boot_pat_state;
46 62
47enum { 63enum {
@@ -53,24 +69,25 @@ enum {
 53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 69 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
54}; 70};
55 71
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 72#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 73
58void pat_init(void) 74void pat_init(void)
59{ 75{
60 u64 pat; 76 u64 pat;
61 77
62 if (!pat_wc_enabled) 78 if (!pat_enabled)
63 return; 79 return;
64 80
65 /* Paranoia check. */ 81 /* Paranoia check. */
66 if (!cpu_has_pat) { 82 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 83 /*
69 * Panic if this happens on the secondary CPU, and we 84 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 85 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 86 * undo PAT.
72 */ 87 */
73 BUG_ON(boot_pat_state); 88 printk(KERN_ERR "PAT enabled, "
89 "but not supported by secondary CPU\n");
90 BUG();
74 } 91 }
75 92
76 /* Set PWT to Write-Combining. All other bits stay the same */ 93 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +103,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 103 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 104 * PAT bit unused
88 */ 105 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 106 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 107 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 108
92 /* Boot CPU check */ 109 /* Boot CPU check */
93 if (!boot_pat_state) 110 if (!boot_pat_state)
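As a worked example, assuming the conventional encodings from the enum above (PAT_UC = 0, PAT_WC = 1, PAT_WB = 6, PAT_UC_MINUS = 7), the eight PAT() terms fill the bytes 06 01 07 00 06 01 07 00 from least to most significant, so the value programmed into the PAT MSR is 0x0007010600070106.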
@@ -103,11 +120,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 120static char *cattr_name(unsigned long flags)
104{ 121{
105 switch (flags & _PAGE_CACHE_MASK) { 122 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 123 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 124 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 125 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 126 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 127 default: return "broken";
111 } 128 }
112} 129}
113 130
@@ -129,14 +146,14 @@ static char *cattr_name(unsigned long flags)
129 */ 146 */
130 147
131struct memtype { 148struct memtype {
132 u64 start; 149 u64 start;
133 u64 end; 150 u64 end;
134 unsigned long type; 151 unsigned long type;
135 struct list_head nd; 152 struct list_head nd;
136}; 153};
137 154
138static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
139static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
140 157
141/* 158/*
142 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -145,47 +162,113 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 162 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 163 * SDM vol 3a
147 */ 164 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 165static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 166{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 167 /*
175 * Look for MTRR hint to get the effective type in case where PAT 168 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 169 * request is for WB.
177 */ 170 */
178 mtrr_type = mtrr_type_lookup(start, end); 171 if (req_type == _PAGE_CACHE_WB) {
172 u8 mtrr_type;
173
174 mtrr_type = mtrr_type_lookup(start, end);
175 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
176 return _PAGE_CACHE_UC;
177 if (mtrr_type == MTRR_TYPE_WRCOMB)
178 return _PAGE_CACHE_WC;
179 }
179 180
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 181 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 182}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 183
183 *ret_prot = prot | _PAGE_CACHE_WC; 184static int
184 } else { 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 186{
187 if (new->type != entry->type) {
188 if (type) {
189 new->type = entry->type;
190 *type = entry->type;
191 } else
192 goto conflict;
186 } 193 }
187 194
195 /* check overlaps with more than one entry in the list */
196 list_for_each_entry_continue(entry, &memtype_list, nd) {
197 if (new->end <= entry->start)
198 break;
199 else if (new->type != entry->type)
200 goto conflict;
201 }
188 return 0; 202 return 0;
203
204 conflict:
205 printk(KERN_INFO "%s:%d conflicting memory types "
206 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
207 new->end, cattr_name(new->type), cattr_name(entry->type));
208 return -EBUSY;
209}
210
211static struct memtype *cached_entry;
212static u64 cached_start;
213
214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
189} 272}
190 273
191/* 274/*
@@ -198,37 +281,37 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when the requester wants to inherit 281 * req_type will have a special case value '-1', when the requester wants to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 282 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 283 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 284 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 285 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 286 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 287 * it will return a negative return value.
205 */ 288 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 290 unsigned long *new_type)
208{ 291{
209 struct memtype *new_entry = NULL; 292 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 293 unsigned long actual_type;
294 struct list_head *where;
295 int is_range_ram;
212 int err = 0; 296 int err = 0;
213 297
214 /* Only track when pat_wc_enabled */ 298 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 299
300 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
217 if (ret_type) { 302 if (new_type) {
218 if (req_type == -1) { 303 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 304 *new_type = _PAGE_CACHE_WB;
220 } else { 305 else
221 *ret_type = req_type; 306 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 307 }
224 return 0; 308 return 0;
225 } 309 }
226 310
227 /* Low ISA region is always mapped WB in page table. No need to track */ 311 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 312 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 313 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 314 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 315 return 0;
233 } 316 }
234 317
@@ -241,206 +324,133 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 324 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 325 u8 mtrr_type = mtrr_type_lookup(start, end);
243 326
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 327 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
247 } else { 329 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
250 }
251 } else { 331 } else {
252 req_type &= _PAGE_CACHE_MASK; 332 actual_type = pat_x_mtrr_type(start, end,
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type); 333 req_type & _PAGE_CACHE_MASK);
254 } 334 }
255 335
256 if (err) { 336 is_range_ram = pagerange_is_ram(start, end);
257 if (ret_type) 337 if (is_range_ram == 1)
258 *ret_type = actual_type; 338 return reserve_ram_pages_type(start, end, req_type, new_type);
259 339 else if (is_range_ram < 0)
260 return -EINVAL; 340 return -EINVAL;
261 }
262 341
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry) 343 if (!new)
265 return -ENOMEM; 344 return -ENOMEM;
266 345
267 new_entry->start = start; 346 new->start = start;
268 new_entry->end = end; 347 new->end = end;
269 new_entry->type = actual_type; 348 new->type = actual_type;
270 349
271 if (ret_type) 350 if (new_type)
272 *ret_type = actual_type; 351 *new_type = actual_type;
273 352
274 spin_lock(&memtype_lock); 353 spin_lock(&memtype_lock);
275 354
276 /* Search for existing mapping that overlaps the current range */ 355 if (cached_entry && start >= cached_start)
277 list_for_each_entry(parse, &memtype_list, nd) { 356 entry = cached_entry;
278 struct memtype *saved_ptr; 357 else
358 entry = list_entry(&memtype_list, struct memtype, nd);
279 359
280 if (parse->start >= end) { 360 /* Search for existing mapping that overlaps the current range */
281 pr_debug("New Entry\n"); 361 where = NULL;
282 list_add(&new_entry->nd, parse->nd.prev); 362 list_for_each_entry_continue(entry, &memtype_list, nd) {
283 new_entry = NULL; 363 if (end <= entry->start) {
364 where = entry->nd.prev;
365 cached_entry = list_entry(where, struct memtype, nd);
284 break; 366 break;
285 } 367 } else if (start <= entry->start) { /* end > entry->start */
286 368 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 369 if (!err) {
288 if (actual_type != parse->type && ret_type) { 370 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 371 entry->start, entry->end);
290 *ret_type = actual_type; 372 where = entry->nd.prev;
291 new_entry->type = actual_type; 373 cached_entry = list_entry(where,
292 } 374 struct memtype, nd);
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 } 375 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 376 break;
337 } 377 } else if (start < entry->end) { /* start > entry->start */
338 378 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 379 if (!err) {
340 if (actual_type != parse->type && ret_type) { 380 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 381 entry->start, entry->end);
342 *ret_type = actual_type; 382 cached_entry = list_entry(entry->nd.prev,
343 new_entry->type = actual_type; 383 struct memtype, nd);
344 } 384
345 385 /*
346 if (actual_type != parse->type) { 386 * Move to right position in the linked
347 printk( 387 * list to add this new entry
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", 388 */
349 current->comm, current->pid, 389 list_for_each_entry_continue(entry,
350 start, end, 390 &memtype_list, nd) {
351 cattr_name(actual_type), 391 if (start <= entry->start) {
352 cattr_name(parse->type)); 392 where = entry->nd.prev;
353 err = -EBUSY; 393 break;
354 break; 394 }
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 } 395 }
377 } 396 }
378
379 if (err) {
380 break;
381 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 397 break;
389 } 398 }
390 } 399 }
391 400
392 if (err) { 401 if (err) {
393 printk(KERN_INFO 402 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 403 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 404 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 405 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
399 return err; 408 return err;
400 } 409 }
401 410
402 if (new_entry) { 411 cached_start = start;
403 /* No conflict. Not yet added to the list. Add to the tail */
404 list_add_tail(&new_entry->nd, &memtype_list);
405 pr_debug("New Entry\n");
406 }
407 412
408 if (ret_type) { 413 if (where)
409 pr_debug( 414 list_add(&new->nd, where);
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 415 else
411 start, end, cattr_name(actual_type), 416 list_add_tail(&new->nd, &memtype_list);
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 417
420 spin_unlock(&memtype_lock); 418 spin_unlock(&memtype_lock);
419
420 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
421 start, end, cattr_name(new->type), cattr_name(req_type),
422 new_type ? cattr_name(*new_type) : "-");
423
421 return err; 424 return err;
422} 425}
423 426
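A minimal sketch of how a caller is expected to use the reworked pair (the physical range and the write-combining request are illustrative only; new_type reports the type actually granted after the MTRR and conflict checks above):

	u64 start = 0xd0000000ULL;	/* hypothetical MMIO range */
	u64 end = start + 0x100000ULL;
	unsigned long got;
	int ret;

	ret = reserve_memtype(start, end, _PAGE_CACHE_WC, &got);
	if (ret)
		return ret;		/* e.g. -EBUSY on a conflicting entry */
	/* ... map and use the range with the 'got' cache type ... */
	free_memtype(start, end);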
424int free_memtype(u64 start, u64 end) 427int free_memtype(u64 start, u64 end)
425{ 428{
426 struct memtype *ml; 429 struct memtype *entry;
427 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
428 432
429 /* Only track when pat_wc_enabled */ 433 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 434 return 0;
432 }
433 435
434 /* Low ISA region is always mapped WB. No need to track */ 436 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 437 if (is_ISA_range(start, end - 1))
436 return 0; 438 return 0;
437 } 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
438 445
439 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 448 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 449 if (cached_entry == entry || cached_start == start)
443 kfree(ml); 450 cached_entry = NULL;
451
452 list_del(&entry->nd);
453 kfree(entry);
444 err = 0; 454 err = 0;
445 break; 455 break;
446 } 456 }
@@ -452,27 +462,20 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 462 current->comm, current->pid, start, end);
453 } 463 }
454 464
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
456 return err; 467 return err;
457} 468}
458 469
459 470
460/*
461 * /dev/mem mmap interface. The memtype used for mapping varies:
462 * - Use UC for mappings with O_SYNC flag
463 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
464 * inherit the memtype from existing mapping.
465 * - Else use UC_MINUS memtype (for backward compatibility with existing
466 * X drivers.
467 */
468pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 471pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
469 unsigned long size, pgprot_t vma_prot) 472 unsigned long size, pgprot_t vma_prot)
470{ 473{
471 return vma_prot; 474 return vma_prot;
472} 475}
473 476
474#ifdef CONFIG_NONPROMISC_DEVMEM 477#ifdef CONFIG_STRICT_DEVMEM
475/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 478/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
476static inline int range_is_allowed(unsigned long pfn, unsigned long size) 479static inline int range_is_allowed(unsigned long pfn, unsigned long size)
477{ 480{
478 return 1; 481 return 1;
@@ -496,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
496 } 499 }
497 return 1; 500 return 1;
498} 501}
499#endif /* CONFIG_NONPROMISC_DEVMEM */ 502#endif /* CONFIG_STRICT_DEVMEM */
500 503
501int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 504int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
502 unsigned long size, pgprot_t *vma_prot) 505 unsigned long size, pgprot_t *vma_prot)
503{ 506{
504 u64 offset = ((u64) pfn) << PAGE_SHIFT; 507 u64 offset = ((u64) pfn) << PAGE_SHIFT;
505 unsigned long flags = _PAGE_CACHE_UC_MINUS; 508 unsigned long flags = -1;
506 int retval; 509 int retval;
507 510
508 if (!range_is_allowed(pfn, size)) 511 if (!range_is_allowed(pfn, size))
509 return 0; 512 return 0;
510 513
511 if (file->f_flags & O_SYNC) { 514 if (file->f_flags & O_SYNC) {
512 flags = _PAGE_CACHE_UC; 515 flags = _PAGE_CACHE_UC_MINUS;
513 } 516 }
514 517
515#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
@@ -521,24 +524,25 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 524 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 525 * we maintain the tradition of paranoia in this code.
523 */ 526 */
524 if (!pat_wc_enabled && 527 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 528 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 529 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 530 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 531 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 532 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 533 flags = _PAGE_CACHE_UC;
531 } 534 }
532#endif 535#endif
533 536
534 /* 537 /*
535 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 538 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
539 *
536 * Without O_SYNC, we want to get 540 * Without O_SYNC, we want to get
537 * - WB for WB-able memory and no other conflicting mappings 541 * - WB for WB-able memory and no other conflicting mappings
538 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 542 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
539 * - Inherit from conflicting mappings otherwise 543 * - Inherit from conflicting mappings otherwise
540 */ 544 */
541 if (flags != _PAGE_CACHE_UC_MINUS) { 545 if (flags != -1) {
542 retval = reserve_memtype(offset, offset + size, flags, NULL); 546 retval = reserve_memtype(offset, offset + size, flags, NULL);
543 } else { 547 } else {
544 retval = reserve_memtype(offset, offset + size, -1, &flags); 548 retval = reserve_memtype(offset, offset + size, -1, &flags);
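From user space the net effect is that an O_SYNC open of /dev/mem now yields an uncached-minus mapping, while a plain open lets the kernel pick WB, UC_MINUS, or an inherited type as described in the comment above. A small illustrative snippet (the physical offset is made up for the example):

	#include <fcntl.h>
	#include <sys/mman.h>

	static void *map_mmio_uncached(void)
	{
		int fd = open("/dev/mem", O_RDWR | O_SYNC);	/* uncached-minus */

		/* 0xfed00000 is a hypothetical MMIO page, for illustration only */
		return mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0xfed00000);
	}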
@@ -547,8 +551,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
547 if (retval < 0) 551 if (retval < 0)
548 return 0; 552 return 0;
549 553
550 if (pfn <= max_pfn_mapped && 554 if (((pfn < max_low_pfn_mapped) ||
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 555 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
556 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 557 free_memtype(offset, offset + size);
553 printk(KERN_INFO 558 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 559 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -565,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
565 570
566void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
567{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
568 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
569 unsigned long flags; 575 unsigned long flags;
570 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
571 576
572 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
573 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -587,3 +592,90 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
587 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
588} 593}
589 594
595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
596
597/* get Nth element of the linked list */
598static struct memtype *memtype_get_idx(loff_t pos)
599{
600 struct memtype *list_node, *print_entry;
601 int i = 1;
602
603 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
604 if (!print_entry)
605 return NULL;
606
607 spin_lock(&memtype_lock);
608 list_for_each_entry(list_node, &memtype_list, nd) {
609 if (pos == i) {
610 *print_entry = *list_node;
611 spin_unlock(&memtype_lock);
612 return print_entry;
613 }
614 ++i;
615 }
616 spin_unlock(&memtype_lock);
617 kfree(print_entry);
618
619 return NULL;
620}
621
622static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
623{
624 if (*pos == 0) {
625 ++*pos;
626 seq_printf(seq, "PAT memtype list:\n");
627 }
628
629 return memtype_get_idx(*pos);
630}
631
632static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
633{
634 ++*pos;
635 return memtype_get_idx(*pos);
636}
637
638static void memtype_seq_stop(struct seq_file *seq, void *v)
639{
640}
641
642static int memtype_seq_show(struct seq_file *seq, void *v)
643{
644 struct memtype *print_entry = (struct memtype *)v;
645
646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
647 print_entry->start, print_entry->end);
648 kfree(print_entry);
649
650 return 0;
651}
652
653static struct seq_operations memtype_seq_ops = {
654 .start = memtype_seq_start,
655 .next = memtype_seq_next,
656 .stop = memtype_seq_stop,
657 .show = memtype_seq_show,
658};
659
660static int memtype_seq_open(struct inode *inode, struct file *file)
661{
662 return seq_open(file, &memtype_seq_ops);
663}
664
665static const struct file_operations memtype_fops = {
666 .open = memtype_seq_open,
667 .read = seq_read,
668 .llseek = seq_lseek,
669 .release = seq_release,
670};
671
672static int __init pat_memtype_list_init(void)
673{
674 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
675 NULL, &memtype_fops);
676 return 0;
677}
678
679late_initcall(pat_memtype_list_init);
680
681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
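With CONFIG_DEBUG_FS and PAT enabled, the current list can then be inspected at run time, typically as /sys/kernel/debug/x86/pat_memtype_list (arch_debugfs_dir is the x86 directory under debugfs); each entry is printed by memtype_seq_show() in the form "<type> @ 0x<start>-0x<end>", for example "uncached-minus @ 0xfed00000-0xfed01000" (an illustrative entry, not output from a real system).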
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is extension instruction */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register ident in mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
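A worked example shows how the decoder pieces fit together (the instruction bytes are hand-assembled for illustration and are not part of the patch). For mov word ptr [disp32], 0xBEEF, encoded as 66 C7 05 <disp32> EF BE, skip_prefix() consumes the 0x66 operand-size prefix and sets shorted, get_opcode() returns 0xC7 (listed in imm_wop), and ModRM byte 0x05 selects a 32-bit displacement:

	unsigned char insn[] = { 0x66, 0xC7, 0x05,
				 0x00, 0x10, 0x00, 0x00,	/* disp32 */
				 0xEF, 0xBE };			/* imm16  */
	unsigned long ip = (unsigned long)insn;

	/* get_ins_type(ip)      -> IMM_WRITE (0xC7 is in imm_wop)          */
	/* get_ins_mem_width(ip) -> 2, because the 0x66 prefix set shorted  */
	/* get_ins_imm_val(ip)   -> 0xBEEF, read after the 4-byte disp32    */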
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
27 NOTHING, /* access others point in regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* Other instructions can not intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
62#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
63 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
64 65
65static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
66{ 67{
67 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74
75 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
77 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 83 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 84 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 85 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 86}
96 87
97static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
98{ 89{
99 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
100 91
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd)
119 110
120#ifdef CONFIG_X86_PAE 111#ifdef CONFIG_X86_PAE
121/* 112/*
113 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
114 * updating the top-level pagetable entries to guarantee the
115 * processor notices the update. Since this is expensive, and
116 * all 4 top-level entries are used almost immediately in a
117 * new process's life, we just pre-populate them here.
118 *
119 * Also, if we're in a paravirt environment where the kernel pmd is
120 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
121 * and initialize the kernel pmds here.
122 */
123#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
124
125void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
126{
127 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
128
129 /* Note: almost everything apart from _PAGE_PRESENT is
130 reserved at the pmd (PDPT) level. */
131 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
132
133 /*
134 * According to Intel App note "TLBs, Paging-Structure Caches,
135 * and Their Invalidation", April 2007, document 317080-001,
136 * section 8.1: in PAE mode we explicitly have to flush the
137 * TLB via cr3 if the top-level pgd is changed...
138 */
139 if (mm == current->active_mm)
140 write_cr3(read_cr3());
141}
142#else /* !CONFIG_X86_PAE */
143
144/* No need to prepopulate any pagetable entries in non-PAE modes. */
145#define PREALLOCATED_PMDS 0
146
147#endif /* CONFIG_X86_PAE */
148
149static void free_pmds(pmd_t *pmds[])
150{
151 int i;
152
153 for(i = 0; i < PREALLOCATED_PMDS; i++)
154 if (pmds[i])
155 free_page((unsigned long)pmds[i]);
156}
157
158static int preallocate_pmds(pmd_t *pmds[])
159{
160 int i;
161 bool failed = false;
162
163 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
165 if (pmd == NULL)
166 failed = true;
167 pmds[i] = pmd;
168 }
169
170 if (failed) {
171 free_pmds(pmds);
172 return -ENOMEM;
173 }
174
175 return 0;
176}
177
178/*
122 * Mop up any pmd pages which may still be attached to the pgd. 179 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 180 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 181 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 185{
129 int i; 186 int i;
130 187
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 188 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 189 pgd_t pgd = pgdp[i];
133 190
134 if (pgd_val(pgd) != 0) { 191 if (pgd_val(pgd) != 0) {
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 199 }
143} 200}
144 201
145/* 202static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 203{
158 pud_t *pud; 204 pud_t *pud;
159 unsigned long addr; 205 unsigned long addr;
160 int i; 206 int i;
161 207
208 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
209 return;
210
162 pud = pud_offset(pgd, 0); 211 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 212
167 if (!pmd) { 213 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 214 i++, pud++, addr += PUD_SIZE) {
169 return 0; 215 pmd_t *pmd = pmds[i];
170 }
171 216
172 if (i >= KERNEL_PGD_BOUNDARY) 217 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 218 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 220
176 pud_populate(mm, pud, pmd); 221 pud_populate(mm, pud, pmd);
177 } 222 }
178
179 return 1;
180} 223}
181 224
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 225pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 226{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 227 pgd_t *pgd;
228 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags;
185 230
186 /* Note: almost everything apart from _PAGE_PRESENT is 231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 232
190 /* 233 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 234 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 235
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 236 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 237
211pgd_t *pgd_alloc(struct mm_struct *mm) 238 if (preallocate_pmds(pmds) != 0)
212{ 239 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 240
215 /* so that alloc_pmd can use it */ 241 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 242 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 243
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 244 /*
221 pgd_dtor(pgd); 245 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 246 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 247 * never see a partially populated pgd.
224 } 248 */
249 spin_lock_irqsave(&pgd_lock, flags);
250
251 pgd_ctor(pgd);
252 pgd_prepopulate_pmd(mm, pgd, pmds);
253
254 spin_unlock_irqrestore(&pgd_lock, flags);
225 255
226 return pgd; 256 return pgd;
257
258out_free_pmds:
259 free_pmds(pmds);
260out_free_pgd:
261 free_page((unsigned long)pgd);
262out:
263 return NULL;
227} 264}
228 265
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 266void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 267{
231 pgd_mop_up_pmds(mm, pgd); 268 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 269 pgd_dtor(pgd);
270 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 271 free_page((unsigned long)pgd);
234} 272}
235 273
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 293
256 if (pte_young(*ptep)) 294 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 295 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 296 (unsigned long *) &ptep->pte);
259 297
260 if (ret) 298 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 299 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 312
275 return young; 313 return young;
276} 314}
315
316int fixmaps_set;
317
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
319{
320 unsigned long address = __fix_to_virt(idx);
321
322 if (idx >= __end_of_fixed_addresses) {
323 BUG();
324 return;
325 }
326 set_pte_vaddr(address, pte);
327 fixmaps_set++;
328}
329
330void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
331{
332 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
333}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,58 +20,11 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
73 */ 26 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 27void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 28{
76 pgd_t *pgd; 29 pgd_t *pgd;
77 pud_t *pud; 30 pud_t *pud;
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 47 return;
95 } 48 }
96 pte = pte_offset_kernel(pmd, vaddr); 49 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 50 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 51 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 52 else
100 pte_clear(&init_mm, vaddr, pte); 53 pte_clear(&init_mm, vaddr, pte);
101 54
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 94 __flush_tlb_one(vaddr);
142} 95}
143 96
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 97unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 98EXPORT_SYMBOL(__FIXADDR_TOP);
147 99
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 100/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 101 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 102 * @reserve - size of hole to reserve
@@ -164,11 +104,45 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 104 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 105 * of kernel address space to make room for a hypervisor.
166 */ 106 */
167void reserve_top_address(unsigned long reserve) 107void __init reserve_top_address(unsigned long reserve)
168{ 108{
169 BUG_ON(fixmaps > 0); 109 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 111 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 113 __VMALLOC_RESERVE += reserve;
174} 114}
115
116/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the
119 * vmalloc area - the default is 128m.
120 */
121static int __init parse_vmalloc(char *arg)
122{
123 if (!arg)
124 return -EINVAL;
125
126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
128 return 0;
129}
130early_param("vmalloc", parse_vmalloc);
131
132/*
133 * reservetop=size reserves a hole at the top of the kernel address space which
134 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
135 * so relocating the fixmap can be done before paging initialization.
136 */
137static int __init parse_reservetop(char *arg)
138{
139 unsigned long address;
140
141 if (!arg)
142 return -EINVAL;
143
144 address = memparse(arg, &arg);
145 reserve_top_address(address);
146 return 0;
147}
148early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/mm/srat_32.c
index 70e4a374b4e8..16ae70fc57e7 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -31,6 +31,7 @@
31#include <asm/srat.h> 31#include <asm/srat.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/smp.h> 33#include <asm/smp.h>
34#include <asm/e820.h>
34 35
35/* 36/*
36 * proximity macros and definitions 37 * proximity macros and definitions
@@ -41,7 +42,7 @@
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) 42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42/* bitmap length; _PXM is at most 255 */ 43/* bitmap length; _PXM is at most 255 */
43#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
44static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ 45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
45 46
46#define MAX_CHUNKS_PER_NODE 3 47#define MAX_CHUNKS_PER_NODE 3
47#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) 48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -52,16 +53,37 @@ struct node_memory_chunk_s {
52 u8 nid; // which cnode contains this chunk? 53 u8 nid; // which cnode contains this chunk?
53 u8 bank; // which mem bank on this node 54 u8 bank; // which mem bank on this node
54}; 55};
55static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; 56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
56 57
57static int num_memory_chunks; /* total number of memory chunks */ 58static int __initdata num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID]; 59static u8 __initdata apicid_to_pxm[MAX_APICID];
59 60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
60/* Identify CPU proximity domains */ 76/* Identify CPU proximity domains */
61static void __init parse_cpu_affinity_structure(char *p) 77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
62{ 79{
63 struct acpi_srat_cpu_affinity *cpu_affinity = 80 if (srat_disabled())
64 (struct acpi_srat_cpu_affinity *) p; 81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
65 87
66 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) 88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
67 return; /* empty entry */ 89 return; /* empty entry */
@@ -71,7 +93,7 @@ static void __init parse_cpu_affinity_structure(char *p)
71 93
72 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
73 95
74 printk("CPU 0x%02X in proximity domain 0x%02X\n", 96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
75 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); 97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
76} 98}
77 99
@@ -79,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
79 * Identify memory proximity domains and hot-remove capabilities. 101 * Identify memory proximity domains and hot-remove capabilities.
80 * Fill node memory chunk list structure. 102 * Fill node memory chunk list structure.
81 */ 103 */
82static void __init parse_memory_affinity_structure (char *sratp) 104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
83{ 106{
84 unsigned long long paddr, size; 107 unsigned long long paddr, size;
85 unsigned long start_pfn, end_pfn; 108 unsigned long start_pfn, end_pfn;
86 u8 pxm; 109 u8 pxm;
87 struct node_memory_chunk_s *p, *q, *pend; 110 struct node_memory_chunk_s *p, *q, *pend;
88 struct acpi_srat_mem_affinity *memory_affinity = 111
89 (struct acpi_srat_mem_affinity *) sratp; 112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
90 119
91 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) 120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
92 return; /* empty entry */ 121 return; /* empty entry */
@@ -105,7 +134,8 @@ static void __init parse_memory_affinity_structure (char *sratp)
105 134
106 135
107 if (num_memory_chunks >= MAXCHUNKS) { 136 if (num_memory_chunks >= MAXCHUNKS) {
108 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", 137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
109 size/(1024*1024), paddr); 139 size/(1024*1024), paddr);
110 return; 140 return;
111 } 141 }
@@ -126,21 +156,29 @@ static void __init parse_memory_affinity_structure (char *sratp)
126 156
127 num_memory_chunks++; 157 num_memory_chunks++;
128 158
129 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", 159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
130 start_pfn, end_pfn, 161 start_pfn, end_pfn,
131 memory_affinity->memory_type,
132 pxm, 162 pxm,
133 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? 163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
134 "enabled and removable" : "enabled" ) ); 164 "enabled and removable" : "enabled" ) );
135} 165}
136 166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
137/* 175/*
138 * The SRAT table always lists ascending addresses, so can always 176 * The SRAT table always lists ascending addresses, so can always
139 * assume that the first "start" address that you see is the real 177 * assume that the first "start" address that you see is the real
140 * start of the node, and that the current "end" address is after 178 * start of the node, and that the current "end" address is after
141 * the previous one. 179 * the previous one.
142 */ 180 */
143static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) 181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
144{ 182{
145 /* 183 /*
146 * Only add present memory as told by the e820. 184 * Only add present memory as told by the e820.
@@ -149,12 +187,12 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
149 * *possible* memory hotplug areas the same as normal RAM. 187 * *possible* memory hotplug areas the same as normal RAM.
150 */ 188 */
151 if (memory_chunk->start_pfn >= max_pfn) { 189 if (memory_chunk->start_pfn >= max_pfn) {
152 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", 190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
153 memory_chunk->start_pfn, memory_chunk->end_pfn); 191 memory_chunk->start_pfn, memory_chunk->end_pfn);
154 return; 192 return -1;
155 } 193 }
156 if (memory_chunk->nid != nid) 194 if (memory_chunk->nid != nid)
157 return; 195 return -1;
158 196
159 if (!node_has_online_mem(nid)) 197 if (!node_has_online_mem(nid))
160 node_start_pfn[nid] = memory_chunk->start_pfn; 198 node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -164,44 +202,21 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
164 202
165 if (node_end_pfn[nid] < memory_chunk->end_pfn) 203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
166 node_end_pfn[nid] = memory_chunk->end_pfn; 204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
167} 207}
168 208
169/* Parse the ACPI Static Resource Affinity Table */ 209int __init get_memcfg_from_srat(void)
170static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
171{ 210{
172 u8 *start, *end, *p;
173 int i, j, nid; 211 int i, j, nid;
174 212
175 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
176 p = start;
177 end = (u8 *)sratp + sratp->header.length;
178
179 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
180 memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
181 213
182 num_memory_chunks = 0; 214 if (srat_disabled())
183 while (p < end) { 215 goto out_fail;
184 switch (*p) {
185 case ACPI_SRAT_TYPE_CPU_AFFINITY:
186 parse_cpu_affinity_structure(p);
187 break;
188 case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
189 parse_memory_affinity_structure(p);
190 break;
191 default:
192 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
193 break;
194 }
195 p += p[1];
196 if (p[1] == 0) {
197 printk("acpi20_parse_srat: Entry length value is zero;"
198 " can't parse any further!\n");
199 break;
200 }
201 }
202 216
203 if (num_memory_chunks == 0) { 217 if (num_memory_chunks == 0) {
204 printk("could not finy any ACPI SRAT memory areas.\n"); 218 printk(KERN_WARNING
219 "could not finy any ACPI SRAT memory areas.\n");
205 goto out_fail; 220 goto out_fail;
206 } 221 }
207 222
@@ -228,131 +243,41 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
228 for (i = 0; i < num_memory_chunks; i++) 243 for (i = 0; i < num_memory_chunks; i++)
229 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); 244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
230 245
231 printk("pxm bitmap: "); 246 printk(KERN_DEBUG "pxm bitmap: ");
232 for (i = 0; i < sizeof(pxm_bitmap); i++) { 247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
233 printk("%02X ", pxm_bitmap[i]); 248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
234 } 249 }
235 printk("\n"); 250 printk(KERN_CONT "\n");
236 printk("Number of logical nodes in system = %d\n", num_online_nodes()); 251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
237 printk("Number of memory chunks in system = %d\n", num_memory_chunks); 252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
238 255
239 for (i = 0; i < MAX_APICID; i++) 256 for (i = 0; i < MAX_APICID; i++)
240 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
241 258
242 for (j = 0; j < num_memory_chunks; j++){ 259 for (j = 0; j < num_memory_chunks; j++){
243 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
244 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
245 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
246 node_read_chunk(chunk->nid, chunk); 264 if (node_read_chunk(chunk->nid, chunk))
247 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); 265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
248 } 269 }
249 270
250 for_each_online_node(nid) { 271 for_each_online_node(nid) {
251 unsigned long start = node_start_pfn[nid]; 272 unsigned long start = node_start_pfn[nid];
252 unsigned long end = node_end_pfn[nid]; 273 unsigned long end = min(node_end_pfn[nid], max_pfn);
253 274
254 memory_present(nid, start, end); 275 memory_present(nid, start, end);
255 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); 276 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
256 } 277 }
257 return 1; 278 return 1;
258out_fail: 279out_fail:
259 return 0; 280 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
260} 281 " table\n");
261
262struct acpi_static_rsdt {
263 struct acpi_table_rsdt table;
264 u32 padding[7]; /* Allow for 7 more table entries */
265};
266
267int __init get_memcfg_from_srat(void)
268{
269 struct acpi_table_header *header = NULL;
270 struct acpi_table_rsdp *rsdp = NULL;
271 struct acpi_table_rsdt *rsdt = NULL;
272 acpi_native_uint rsdp_address = 0;
273 struct acpi_static_rsdt saved_rsdt;
274 int tables = 0;
275 int i = 0;
276
277 rsdp_address = acpi_os_get_root_pointer();
278 if (!rsdp_address) {
279 printk("%s: System description tables not found\n",
280 __func__);
281 goto out_err;
282 }
283
284 printk("%s: assigning address to rsdp\n", __func__);
285 rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
286 if (!rsdp) {
287 printk("%s: Didn't find ACPI root!\n", __func__);
288 goto out_err;
289 }
290
291 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
292 rsdp->oem_id);
293
294 if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
295 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
296 goto out_err;
297 }
298
299 rsdt = (struct acpi_table_rsdt *)
300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
301
302 if (!rsdt) {
303 printk(KERN_WARNING
304 "%s: ACPI: Invalid root system description tables (RSDT)\n",
305 __func__);
306 goto out_err;
307 }
308
309 header = &rsdt->header;
310
311 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
312 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
313 goto out_err;
314 }
315
316 /*
317 * The number of tables is computed by taking the
318 * size of all entries (header size minus total
319 * size of RSDT) divided by the size of each entry
320 * (4-byte table pointers).
321 */
322 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
323
324 if (!tables)
325 goto out_err;
326
327 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
328
329 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
330 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
331 saved_rsdt.table.header.length);
332 goto out_err;
333 }
334
335 printk("Begin SRAT table scan....\n");
336
337 for (i = 0; i < tables; i++) {
338 /* Map in header, then map in full table length. */
339 header = (struct acpi_table_header *)
340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
341 if (!header)
342 break;
343 header = (struct acpi_table_header *)
344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
345 if (!header)
346 break;
347
348 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
349 continue;
350
351 /* we've found the srat table. don't need to look at any more tables */
352 return acpi20_parse_srat((struct acpi_table_srat *)header);
353 }
354out_err:
355 remove_all_active_ranges();
356 printk("failed to get NUMA memory information from SRAT table\n");
357 return 0; 282 return 0;
358} 283}
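
A behavioural detail that is easy to miss: node_read_chunk() now reports failure for chunks lying entirely above max_pfn, and chunks that merely straddle it are clamped before being handed to e820_register_active_regions(), so ranges beyond what the e820 map reports as present never end up in a node's active regions. Roughly, with illustrative numbers (assume max_pfn = 0x40000, i.e. 1 GB):

	/* Illustrative values only. */
	chunk->start_pfn = 0x30000;		/* 768 MB  - below max_pfn, accepted */
	chunk->end_pfn   = 0x50000;		/* 1.25 GB - beyond max_pfn          */

	/* node_read_chunk() returns 0, and the registered range is        */
	/* [0x30000, min(0x50000, max_pfn)) = [0x30000, 0x40000).          */
	/* A chunk starting at or above 0x40000 would be skipped entirely. */
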
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..51c0a2fc14fe 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic(" Can not save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -126,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
126 return; 138 return;
127 } 139 }
128 140
129 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
130 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
131 else 143 else
132 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 311 pxmram = 0;
300 } 312 }
301 313
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 317 printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
377 continue; 389 continue;
378 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
380 } 392 }
381 numa_init_array(); 393 numa_init_array();
382 return 0; 394 return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
495 507
496EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
497 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
499{ 512{
500 int i, ret = 0; 513 int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 519 return ret;
507} 520}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 522#endif
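
acpi_numa_slit_init() used to stash the raw pointer it was handed; the new version copies the table instead, presumably because the firmware-provided SLIT is not guaranteed to stay mapped or intact once early table parsing is over, while __node_distance() needs it for the lifetime of the system. The mechanism is the usual early-boot idiom: find a free, page-aligned range in the e820 map, copy into it through the direct mapping, and mark it reserved so nothing else claims it. Condensed from the hunk above:

	unsigned length = slit->header.length;
	unsigned long phys = find_e820_area(0, max_pfn_mapped << PAGE_SHIFT,
					    length, PAGE_SIZE);

	if (phys == -1L)
		panic(" Can not save slit!\n");

	acpi_slit = __va(phys);			/* below max_pfn_mapped, so already mapped */
	memcpy(acpi_slit, slit, length);
	reserve_early(phys, phys + length, "ACPI SLIT");
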
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
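
testmmiotrace is purely a stress generator for the mmiotrace machinery: it maps 16 kB of the given physical range uncached, scribbles over it with 8-, 16- and 32-bit accesses so the tracer has something to record, then unmaps it again. It is meant to be loaded by hand with the address of an otherwise unused region, along the lines of modprobe testmmiotrace mmio_address=0xfbb00000 (the address here is illustrative only), with mmiotrace already enabled through the tracing interface; as the init message shouts, pointing it at live device registers will happily corrupt them.
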
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 30f3eb366667..446902b2a6b6 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
7 timer_int.o ) 7 timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ 10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
11 op_model_ppro.o op_model_p4.o 11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o 12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cc48d3fde545..57f6c9088081 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,10 +1,11 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002-2008 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com>
8 */ 9 */
9 10
10#include <linux/init.h> 11#include <linux/init.h>
@@ -15,6 +16,7 @@
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
17#include <linux/kdebug.h> 18#include <linux/kdebug.h>
19#include <linux/cpu.h>
18#include <asm/nmi.h> 20#include <asm/nmi.h>
19#include <asm/msr.h> 21#include <asm/msr.h>
20#include <asm/apic.h> 22#include <asm/apic.h>
@@ -28,23 +30,48 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
28 30
29static int nmi_start(void); 31static int nmi_start(void);
30static void nmi_stop(void); 32static void nmi_stop(void);
33static void nmi_cpu_start(void *dummy);
34static void nmi_cpu_stop(void *dummy);
31 35
32/* 0 == registered but off, 1 == registered and on */ 36/* 0 == registered but off, 1 == registered and on */
33static int nmi_enabled = 0; 37static int nmi_enabled = 0;
34 38
39#ifdef CONFIG_SMP
40static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
41 void *data)
42{
43 int cpu = (unsigned long)data;
44 switch (action) {
45 case CPU_DOWN_FAILED:
46 case CPU_ONLINE:
47 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
48 break;
49 case CPU_DOWN_PREPARE:
50 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
51 break;
52 }
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block oprofile_cpu_nb = {
57 .notifier_call = oprofile_cpu_notifier
58};
59#endif
60
35#ifdef CONFIG_PM 61#ifdef CONFIG_PM
36 62
37static int nmi_suspend(struct sys_device *dev, pm_message_t state) 63static int nmi_suspend(struct sys_device *dev, pm_message_t state)
38{ 64{
65 /* Only one CPU left, just stop that one */
39 if (nmi_enabled == 1) 66 if (nmi_enabled == 1)
40 nmi_stop(); 67 nmi_cpu_stop(NULL);
41 return 0; 68 return 0;
42} 69}
43 70
44static int nmi_resume(struct sys_device *dev) 71static int nmi_resume(struct sys_device *dev)
45{ 72{
46 if (nmi_enabled == 1) 73 if (nmi_enabled == 1)
47 nmi_start(); 74 nmi_cpu_start(NULL);
48 return 0; 75 return 0;
49} 76}
50 77
@@ -218,8 +245,8 @@ static int nmi_setup(void)
218 } 245 }
219 246
220 } 247 }
221 on_each_cpu(nmi_save_registers, NULL, 0, 1); 248 on_each_cpu(nmi_save_registers, NULL, 1);
222 on_each_cpu(nmi_cpu_setup, NULL, 0, 1); 249 on_each_cpu(nmi_cpu_setup, NULL, 1);
223 nmi_enabled = 1; 250 nmi_enabled = 1;
224 return 0; 251 return 0;
225} 252}
@@ -269,12 +296,15 @@ static void nmi_cpu_shutdown(void *dummy)
269 296
270static void nmi_shutdown(void) 297static void nmi_shutdown(void)
271{ 298{
272 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); 299 struct op_msrs *msrs;
300
273 nmi_enabled = 0; 301 nmi_enabled = 0;
274 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); 302 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
275 unregister_die_notifier(&profile_exceptions_nb); 303 unregister_die_notifier(&profile_exceptions_nb);
304 msrs = &get_cpu_var(cpu_msrs);
276 model->shutdown(msrs); 305 model->shutdown(msrs);
277 free_msrs(); 306 free_msrs();
307 put_cpu_var(cpu_msrs);
278} 308}
279 309
280static void nmi_cpu_start(void *dummy) 310static void nmi_cpu_start(void *dummy)
@@ -285,7 +315,7 @@ static void nmi_cpu_start(void *dummy)
285 315
286static int nmi_start(void) 316static int nmi_start(void)
287{ 317{
288 on_each_cpu(nmi_cpu_start, NULL, 0, 1); 318 on_each_cpu(nmi_cpu_start, NULL, 1);
289 return 0; 319 return 0;
290} 320}
291 321
@@ -297,7 +327,7 @@ static void nmi_cpu_stop(void *dummy)
297 327
298static void nmi_stop(void) 328static void nmi_stop(void)
299{ 329{
300 on_each_cpu(nmi_cpu_stop, NULL, 0, 1); 330 on_each_cpu(nmi_cpu_stop, NULL, 1);
301} 331}
302 332
303struct op_counter_config counter_config[OP_MAX_COUNTER]; 333struct op_counter_config counter_config[OP_MAX_COUNTER];
@@ -368,20 +398,34 @@ static int __init ppro_init(char **cpu_type)
368{ 398{
369 __u8 cpu_model = boot_cpu_data.x86_model; 399 __u8 cpu_model = boot_cpu_data.x86_model;
370 400
371 if (cpu_model == 14) 401 switch (cpu_model) {
402 case 0 ... 2:
403 *cpu_type = "i386/ppro";
404 break;
405 case 3 ... 5:
406 *cpu_type = "i386/pii";
407 break;
408 case 6 ... 8:
409 *cpu_type = "i386/piii";
410 break;
411 case 9:
412 *cpu_type = "i386/p6_mobile";
413 break;
414 case 10 ... 13:
415 *cpu_type = "i386/p6";
416 break;
417 case 14:
372 *cpu_type = "i386/core"; 418 *cpu_type = "i386/core";
373 else if (cpu_model == 15 || cpu_model == 23) 419 break;
420 case 15: case 23:
421 *cpu_type = "i386/core_2";
422 break;
423 case 26:
374 *cpu_type = "i386/core_2"; 424 *cpu_type = "i386/core_2";
375 else if (cpu_model > 0xd) 425 break;
426 default:
427 /* Unknown */
376 return 0; 428 return 0;
377 else if (cpu_model == 9) {
378 *cpu_type = "i386/p6_mobile";
379 } else if (cpu_model > 5) {
380 *cpu_type = "i386/piii";
381 } else if (cpu_model > 2) {
382 *cpu_type = "i386/pii";
383 } else {
384 *cpu_type = "i386/ppro";
385 } 429 }
386 430
387 model = &op_ppro_spec; 431 model = &op_ppro_spec;
@@ -396,6 +440,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
396 __u8 vendor = boot_cpu_data.x86_vendor; 440 __u8 vendor = boot_cpu_data.x86_vendor;
397 __u8 family = boot_cpu_data.x86; 441 __u8 family = boot_cpu_data.x86;
398 char *cpu_type; 442 char *cpu_type;
443 int ret = 0;
399 444
400 if (!cpu_has_apic) 445 if (!cpu_has_apic)
401 return -ENODEV; 446 return -ENODEV;
@@ -408,19 +453,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
408 default: 453 default:
409 return -ENODEV; 454 return -ENODEV;
410 case 6: 455 case 6:
411 model = &op_athlon_spec; 456 model = &op_amd_spec;
412 cpu_type = "i386/athlon"; 457 cpu_type = "i386/athlon";
413 break; 458 break;
414 case 0xf: 459 case 0xf:
415 model = &op_athlon_spec; 460 model = &op_amd_spec;
416 /* Actually it could be i386/hammer too, but give 461 /* Actually it could be i386/hammer too, but give
417 user space an consistent name. */ 462 user space an consistent name. */
418 cpu_type = "x86-64/hammer"; 463 cpu_type = "x86-64/hammer";
419 break; 464 break;
420 case 0x10: 465 case 0x10:
421 model = &op_athlon_spec; 466 model = &op_amd_spec;
422 cpu_type = "x86-64/family10"; 467 cpu_type = "x86-64/family10";
423 break; 468 break;
469 case 0x11:
470 model = &op_amd_spec;
471 cpu_type = "x86-64/family11h";
472 break;
424 } 473 }
425 break; 474 break;
426 475
@@ -447,20 +496,36 @@ int __init op_nmi_init(struct oprofile_operations *ops)
447 return -ENODEV; 496 return -ENODEV;
448 } 497 }
449 498
450 init_sysfs(); 499#ifdef CONFIG_SMP
451 using_nmi = 1; 500 register_cpu_notifier(&oprofile_cpu_nb);
501#endif
502 /* default values, can be overwritten by model */
452 ops->create_files = nmi_create_files; 503 ops->create_files = nmi_create_files;
453 ops->setup = nmi_setup; 504 ops->setup = nmi_setup;
454 ops->shutdown = nmi_shutdown; 505 ops->shutdown = nmi_shutdown;
455 ops->start = nmi_start; 506 ops->start = nmi_start;
456 ops->stop = nmi_stop; 507 ops->stop = nmi_stop;
457 ops->cpu_type = cpu_type; 508 ops->cpu_type = cpu_type;
509
510 if (model->init)
511 ret = model->init(ops);
512 if (ret)
513 return ret;
514
515 init_sysfs();
516 using_nmi = 1;
458 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 517 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
459 return 0; 518 return 0;
460} 519}
461 520
462void op_nmi_exit(void) 521void op_nmi_exit(void)
463{ 522{
464 if (using_nmi) 523 if (using_nmi) {
465 exit_sysfs(); 524 exit_sysfs();
525#ifdef CONFIG_SMP
526 unregister_cpu_notifier(&oprofile_cpu_nb);
527#endif
528 }
529 if (model->exit)
530 model->exit();
466} 531}
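
Beyond the athlon-to-AMD renaming, two separate changes are folded into this file: every on_each_cpu() call loses an argument, because the 'retry' parameter was dropped from the SMP call interface in this kernel generation (what remains is function, argument, and the 'wait' flag), and a CPU hotplug notifier is registered so the NMI counters are started on a CPU as it comes online and stopped before one is taken down. A minimal sketch of such a notifier outside oprofile, with hypothetical my_* names (assumes <linux/cpu.h>, <linux/smp.h>, <linux/notifier.h>):

	static void my_start_counters(void *unused) { /* per-CPU start */ }
	static void my_stop_counters(void *unused)  { /* per-CPU stop  */ }

	static int my_cpu_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		int cpu = (unsigned long)data;

		switch (action) {
		case CPU_ONLINE:
		case CPU_DOWN_FAILED:
			/* fire and forget: don't wait for the IPI handler */
			smp_call_function_single(cpu, my_start_counters, NULL, 0);
			break;
		case CPU_DOWN_PREPARE:
			/* must be quiesced before the CPU goes away */
			smp_call_function_single(cpu, my_stop_counters, NULL, 1);
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block my_cpu_nb = {
		.notifier_call = my_cpu_notify,
	};
	/* register_cpu_notifier(&my_cpu_nb) at init, unregister on exit */
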
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 000000000000..d9faf607b3a6
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,543 @@
1/*
2 * @file op_model_amd.c
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002-2008 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf
13*/
14
15#include <linux/oprofile.h>
16#include <linux/device.h>
17#include <linux/pci.h>
18
19#include <asm/ptrace.h>
20#include <asm/msr.h>
21#include <asm/nmi.h>
22
23#include "op_x86_model.h"
24#include "op_counter.h"
25
26#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4
28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51
52#ifdef CONFIG_OPROFILE_IBS
53
54/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
58
59/*IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
62
63/* Codes used in cpu_buffer.c */
64/* This produces duplicate code, need to be fixed */
65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4
67
68/* The function interface needs to be fixed, something like add
69 data. Should then be added to linux/oprofile.h. */
70extern void oprofile_add_ibs_sample(struct pt_regs *const regs,
71 unsigned int * const ibs_sample, u8 code);
72
73struct ibs_fetch_sample {
74 /* MSRC001_1031 IBS Fetch Linear Address Register */
75 unsigned int ibs_fetch_lin_addr_low;
76 unsigned int ibs_fetch_lin_addr_high;
77 /* MSRC001_1030 IBS Fetch Control Register */
78 unsigned int ibs_fetch_ctl_low;
79 unsigned int ibs_fetch_ctl_high;
80 /* MSRC001_1032 IBS Fetch Physical Address Register */
81 unsigned int ibs_fetch_phys_addr_low;
82 unsigned int ibs_fetch_phys_addr_high;
83};
84
85struct ibs_op_sample {
86 /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
87 unsigned int ibs_op_rip_low;
88 unsigned int ibs_op_rip_high;
89 /* MSRC001_1035 IBS Op Data Register */
90 unsigned int ibs_op_data1_low;
91 unsigned int ibs_op_data1_high;
92 /* MSRC001_1036 IBS Op Data 2 Register */
93 unsigned int ibs_op_data2_low;
94 unsigned int ibs_op_data2_high;
95 /* MSRC001_1037 IBS Op Data 3 Register */
96 unsigned int ibs_op_data3_low;
97 unsigned int ibs_op_data3_high;
98 /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
99 unsigned int ibs_dc_linear_low;
100 unsigned int ibs_dc_linear_high;
101 /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
102 unsigned int ibs_dc_phys_low;
103 unsigned int ibs_dc_phys_high;
104};
105
106/*
107 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
108*/
109static void clear_ibs_nmi(void);
110
111static int ibs_allowed; /* AMD Family10h and later */
112
113struct op_ibs_config {
114 unsigned long op_enabled;
115 unsigned long fetch_enabled;
116 unsigned long max_cnt_fetch;
117 unsigned long max_cnt_op;
118 unsigned long rand_en;
119 unsigned long dispatched_ops;
120};
121
122static struct op_ibs_config ibs_config;
123
124#endif
125
126/* functions for op_amd_spec */
127
128static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
129{
130 int i;
131
132 for (i = 0; i < NUM_COUNTERS; i++) {
133 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
134 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
135 else
136 msrs->counters[i].addr = 0;
137 }
138
139 for (i = 0; i < NUM_CONTROLS; i++) {
140 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
141 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
142 else
143 msrs->controls[i].addr = 0;
144 }
145}
146
147
148static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
149{
150 unsigned int low, high;
151 int i;
152
153 /* clear all counters */
154 for (i = 0 ; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
156 continue;
157 CTRL_READ(low, high, msrs, i);
158 CTRL_CLEAR_LO(low);
159 CTRL_CLEAR_HI(high);
160 CTRL_WRITE(low, high, msrs, i);
161 }
162
163 /* avoid a false detection of ctr overflows in NMI handler */
164 for (i = 0; i < NUM_COUNTERS; ++i) {
165 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
166 continue;
167 CTR_WRITE(1, msrs, i);
168 }
169
170 /* enable active counters */
171 for (i = 0; i < NUM_COUNTERS; ++i) {
172 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
173 reset_value[i] = counter_config[i].count;
174
175 CTR_WRITE(counter_config[i].count, msrs, i);
176
177 CTRL_READ(low, high, msrs, i);
178 CTRL_CLEAR_LO(low);
179 CTRL_CLEAR_HI(high);
180 CTRL_SET_ENABLE(low);
181 CTRL_SET_USR(low, counter_config[i].user);
182 CTRL_SET_KERN(low, counter_config[i].kernel);
183 CTRL_SET_UM(low, counter_config[i].unit_mask);
184 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
185 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
186 CTRL_SET_HOST_ONLY(high, 0);
187 CTRL_SET_GUEST_ONLY(high, 0);
188
189 CTRL_WRITE(low, high, msrs, i);
190 } else {
191 reset_value[i] = 0;
192 }
193 }
194}
195
196#ifdef CONFIG_OPROFILE_IBS
197
198static inline int
199op_amd_handle_ibs(struct pt_regs * const regs,
200 struct op_msrs const * const msrs)
201{
202 unsigned int low, high;
203 struct ibs_fetch_sample ibs_fetch;
204 struct ibs_op_sample ibs_op;
205
206 if (!ibs_allowed)
207 return 1;
208
209 if (ibs_config.fetch_enabled) {
210 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
211 if (high & IBS_FETCH_HIGH_VALID_BIT) {
212 ibs_fetch.ibs_fetch_ctl_high = high;
213 ibs_fetch.ibs_fetch_ctl_low = low;
214 rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
215 ibs_fetch.ibs_fetch_lin_addr_high = high;
216 ibs_fetch.ibs_fetch_lin_addr_low = low;
217 rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
218 ibs_fetch.ibs_fetch_phys_addr_high = high;
219 ibs_fetch.ibs_fetch_phys_addr_low = low;
220
221 oprofile_add_ibs_sample(regs,
222 (unsigned int *)&ibs_fetch,
223 IBS_FETCH_BEGIN);
224
225 /*reenable the IRQ */
226 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
227 high &= ~IBS_FETCH_HIGH_VALID_BIT;
228 high |= IBS_FETCH_HIGH_ENABLE;
229 low &= IBS_FETCH_LOW_MAX_CNT_MASK;
230 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
231 }
232 }
233
234 if (ibs_config.op_enabled) {
235 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
236 if (low & IBS_OP_LOW_VALID_BIT) {
237 rdmsr(MSR_AMD64_IBSOPRIP, low, high);
238 ibs_op.ibs_op_rip_low = low;
239 ibs_op.ibs_op_rip_high = high;
240 rdmsr(MSR_AMD64_IBSOPDATA, low, high);
241 ibs_op.ibs_op_data1_low = low;
242 ibs_op.ibs_op_data1_high = high;
243 rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
244 ibs_op.ibs_op_data2_low = low;
245 ibs_op.ibs_op_data2_high = high;
246 rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
247 ibs_op.ibs_op_data3_low = low;
248 ibs_op.ibs_op_data3_high = high;
249 rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
250 ibs_op.ibs_dc_linear_low = low;
251 ibs_op.ibs_dc_linear_high = high;
252 rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
253 ibs_op.ibs_dc_phys_low = low;
254 ibs_op.ibs_dc_phys_high = high;
255
256 /* reenable the IRQ */
257 oprofile_add_ibs_sample(regs,
258 (unsigned int *)&ibs_op,
259 IBS_OP_BEGIN);
260 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
261 high = 0;
262 low &= ~IBS_OP_LOW_VALID_BIT;
263 low |= IBS_OP_LOW_ENABLE;
264 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
265 }
266 }
267
268 return 1;
269}
270
271#endif
272
273static int op_amd_check_ctrs(struct pt_regs * const regs,
274 struct op_msrs const * const msrs)
275{
276 unsigned int low, high;
277 int i;
278
279 for (i = 0 ; i < NUM_COUNTERS; ++i) {
280 if (!reset_value[i])
281 continue;
282 CTR_READ(low, high, msrs, i);
283 if (CTR_OVERFLOWED(low)) {
284 oprofile_add_sample(regs, i);
285 CTR_WRITE(reset_value[i], msrs, i);
286 }
287 }
288
289#ifdef CONFIG_OPROFILE_IBS
290 op_amd_handle_ibs(regs, msrs);
291#endif
292
293 /* See op_model_ppro.c */
294 return 1;
295}
296
297static void op_amd_start(struct op_msrs const * const msrs)
298{
299 unsigned int low, high;
300 int i;
301 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
302 if (reset_value[i]) {
303 CTRL_READ(low, high, msrs, i);
304 CTRL_SET_ACTIVE(low);
305 CTRL_WRITE(low, high, msrs, i);
306 }
307 }
308
309#ifdef CONFIG_OPROFILE_IBS
310 if (ibs_allowed && ibs_config.fetch_enabled) {
311 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
312 high = IBS_FETCH_HIGH_ENABLE;
313 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
314 }
315
316 if (ibs_allowed && ibs_config.op_enabled) {
317 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE;
318 high = 0;
319 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
320 }
321#endif
322}
323
324
325static void op_amd_stop(struct op_msrs const * const msrs)
326{
327 unsigned int low, high;
328 int i;
329
330 /* Subtle: stop on all counters to avoid race with
331 * setting our pm callback */
332 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
333 if (!reset_value[i])
334 continue;
335 CTRL_READ(low, high, msrs, i);
336 CTRL_SET_INACTIVE(low);
337 CTRL_WRITE(low, high, msrs, i);
338 }
339
340#ifdef CONFIG_OPROFILE_IBS
341 if (ibs_allowed && ibs_config.fetch_enabled) {
342 low = 0; /* clear max count and enable */
343 high = 0;
344 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
345 }
346
347 if (ibs_allowed && ibs_config.op_enabled) {
348 low = 0; /* clear max count and enable */
349 high = 0;
350 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
351 }
352#endif
353}
354
355static void op_amd_shutdown(struct op_msrs const * const msrs)
356{
357 int i;
358
359 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
360 if (CTR_IS_RESERVED(msrs, i))
361 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
362 }
363 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
364 if (CTRL_IS_RESERVED(msrs, i))
365 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
366 }
367}
368
369#ifndef CONFIG_OPROFILE_IBS
370
371/* no IBS support */
372
373static int op_amd_init(struct oprofile_operations *ops)
374{
375 return 0;
376}
377
378static void op_amd_exit(void) {}
379
380#else
381
382static u8 ibs_eilvt_off;
383
384static inline void apic_init_ibs_nmi_per_cpu(void *arg)
385{
386 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
387}
388
389static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
390{
391 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
392}
393
394static int pfm_amd64_setup_eilvt(void)
395{
396#define IBSCTL_LVTOFFSETVAL (1 << 8)
397#define IBSCTL 0x1cc
398 struct pci_dev *cpu_cfg;
399 int nodes;
400 u32 value = 0;
401
402 /* per CPU setup */
403 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
404
405 nodes = 0;
406 cpu_cfg = NULL;
407 do {
408 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
409 PCI_DEVICE_ID_AMD_10H_NB_MISC,
410 cpu_cfg);
411 if (!cpu_cfg)
412 break;
413 ++nodes;
414 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
415 | IBSCTL_LVTOFFSETVAL);
416 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
417 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
418 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
419 "IBSCTL = 0x%08x", value);
420 return 1;
421 }
422 } while (1);
423
424 if (!nodes) {
425 printk(KERN_DEBUG "No CPU node configured for IBS");
426 return 1;
427 }
428
429#ifdef CONFIG_NUMA
430 /* Sanity check */
431 /* Works only for 64bit with proper numa implementation. */
432 if (nodes != num_possible_nodes()) {
433 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
434 "found: %d, expected %d",
435 nodes, num_possible_nodes());
436 return 1;
437 }
438#endif
439 return 0;
440}
441
442/*
443 * initialize the APIC for the IBS interrupts
444 * if available (AMD Family10h rev B0 and later)
445 */
446static void setup_ibs(void)
447{
448 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
449
450 if (!ibs_allowed)
451 return;
452
453 if (pfm_amd64_setup_eilvt()) {
454 ibs_allowed = 0;
455 return;
456 }
457
458 printk(KERN_INFO "oprofile: AMD IBS detected\n");
459}
460
461
462/*
463 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
464 * rev B0 and later */
465static void clear_ibs_nmi(void)
466{
467 if (ibs_allowed)
468 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
469}
470
471static int (*create_arch_files)(struct super_block * sb, struct dentry * root);
472
473static int setup_ibs_files(struct super_block * sb, struct dentry * root)
474{
475 char buf[12];
476 struct dentry *dir;
477 int ret = 0;
478
479 /* architecture specific files */
480 if (create_arch_files)
481 ret = create_arch_files(sb, root);
482
483 if (ret)
484 return ret;
485
486 if (!ibs_allowed)
487 return ret;
488
489 /* model specific files */
490
491 /* setup some reasonable defaults */
492 ibs_config.max_cnt_fetch = 250000;
493 ibs_config.fetch_enabled = 0;
494 ibs_config.max_cnt_op = 250000;
495 ibs_config.op_enabled = 0;
496 ibs_config.dispatched_ops = 1;
497 snprintf(buf, sizeof(buf), "ibs_fetch");
498 dir = oprofilefs_mkdir(sb, root, buf);
499 oprofilefs_create_ulong(sb, dir, "rand_enable",
500 &ibs_config.rand_en);
501 oprofilefs_create_ulong(sb, dir, "enable",
502 &ibs_config.fetch_enabled);
503 oprofilefs_create_ulong(sb, dir, "max_count",
504 &ibs_config.max_cnt_fetch);
505 snprintf(buf, sizeof(buf), "ibs_uops");
506 dir = oprofilefs_mkdir(sb, root, buf);
507 oprofilefs_create_ulong(sb, dir, "enable",
508 &ibs_config.op_enabled);
509 oprofilefs_create_ulong(sb, dir, "max_count",
510 &ibs_config.max_cnt_op);
511 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
512 &ibs_config.dispatched_ops);
513
514 return 0;
515}
516
517static int op_amd_init(struct oprofile_operations *ops)
518{
519 setup_ibs();
520 create_arch_files = ops->create_files;
521 ops->create_files = setup_ibs_files;
522 return 0;
523}
524
525static void op_amd_exit(void)
526{
527 clear_ibs_nmi();
528}
529
530#endif
531
532struct op_x86_model_spec const op_amd_spec = {
533 .init = op_amd_init,
534 .exit = op_amd_exit,
535 .num_counters = NUM_COUNTERS,
536 .num_controls = NUM_CONTROLS,
537 .fill_in_addresses = &op_amd_fill_in_addresses,
538 .setup_ctrs = &op_amd_setup_ctrs,
539 .check_ctrs = &op_amd_check_ctrs,
540 .start = &op_amd_start,
541 .stop = &op_amd_stop,
542 .shutdown = &op_amd_shutdown
543};
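
The CTRL_* macros near the top of the new file assemble the low and high halves of a K7/K8 event-select MSR piece by piece. Walking one configuration through them makes the layout concrete; event number 0x76 and the user/kernel flags below are just example inputs, and the hex values in the comments follow directly from the macro definitions above:

	unsigned int low = 0, high = 0;

	CTRL_SET_ENABLE(low);			/* bit 20          -> low = 0x00100000 */
	CTRL_SET_USR(low, 1);			/* bit 16          -> low = 0x00110000 */
	CTRL_SET_KERN(low, 1);			/* bit 17          -> low = 0x00130000 */
	CTRL_SET_UM(low, 0);			/* bits 15:8       -> low unchanged    */
	CTRL_SET_EVENT_LOW(low, 0x76);		/* bits 7:0        -> low = 0x00130076 */
	CTRL_SET_EVENT_HIGH(high, 0x76);	/* event bits 11:8 -> high = 0 here    */

	/* op_amd_start() later flips the counter on: */
	CTRL_SET_ACTIVE(low);			/* bit 22          -> low = 0x00530076 */
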
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
deleted file mode 100644
index 3d534879a9dc..000000000000
--- a/arch/x86/oprofile/op_model_athlon.c
+++ /dev/null
@@ -1,190 +0,0 @@
1/*
2 * @file op_model_athlon.h
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
26#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
31#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR_LO(x) (x &= (1<<21))
35#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
36#define CTRL_SET_ENABLE(val) (val |= 1<<20)
37#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
38#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
39#define CTRL_SET_UM(val, m) (val |= (m << 8))
40#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
41#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
42#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
43#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
44
45static unsigned long reset_value[NUM_COUNTERS];
46
47static void athlon_fill_in_addresses(struct op_msrs * const msrs)
48{
49 int i;
50
51 for (i = 0; i < NUM_COUNTERS; i++) {
52 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
53 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
54 else
55 msrs->counters[i].addr = 0;
56 }
57
58 for (i = 0; i < NUM_CONTROLS; i++) {
59 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
60 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
61 else
62 msrs->controls[i].addr = 0;
63 }
64}
65
66
67static void athlon_setup_ctrs(struct op_msrs const * const msrs)
68{
69 unsigned int low, high;
70 int i;
71
72 /* clear all counters */
73 for (i = 0 ; i < NUM_CONTROLS; ++i) {
74 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
75 continue;
76 CTRL_READ(low, high, msrs, i);
77 CTRL_CLEAR_LO(low);
78 CTRL_CLEAR_HI(high);
79 CTRL_WRITE(low, high, msrs, i);
80 }
81
82 /* avoid a false detection of ctr overflows in NMI handler */
83 for (i = 0; i < NUM_COUNTERS; ++i) {
84 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
85 continue;
86 CTR_WRITE(1, msrs, i);
87 }
88
89 /* enable active counters */
90 for (i = 0; i < NUM_COUNTERS; ++i) {
91 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
92 reset_value[i] = counter_config[i].count;
93
94 CTR_WRITE(counter_config[i].count, msrs, i);
95
96 CTRL_READ(low, high, msrs, i);
97 CTRL_CLEAR_LO(low);
98 CTRL_CLEAR_HI(high);
99 CTRL_SET_ENABLE(low);
100 CTRL_SET_USR(low, counter_config[i].user);
101 CTRL_SET_KERN(low, counter_config[i].kernel);
102 CTRL_SET_UM(low, counter_config[i].unit_mask);
103 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
104 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
105 CTRL_SET_HOST_ONLY(high, 0);
106 CTRL_SET_GUEST_ONLY(high, 0);
107
108 CTRL_WRITE(low, high, msrs, i);
109 } else {
110 reset_value[i] = 0;
111 }
112 }
113}
114
115
116static int athlon_check_ctrs(struct pt_regs * const regs,
117 struct op_msrs const * const msrs)
118{
119 unsigned int low, high;
120 int i;
121
122 for (i = 0 ; i < NUM_COUNTERS; ++i) {
123 if (!reset_value[i])
124 continue;
125 CTR_READ(low, high, msrs, i);
126 if (CTR_OVERFLOWED(low)) {
127 oprofile_add_sample(regs, i);
128 CTR_WRITE(reset_value[i], msrs, i);
129 }
130 }
131
132 /* See op_model_ppro.c */
133 return 1;
134}
135
136
137static void athlon_start(struct op_msrs const * const msrs)
138{
139 unsigned int low, high;
140 int i;
141 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
142 if (reset_value[i]) {
143 CTRL_READ(low, high, msrs, i);
144 CTRL_SET_ACTIVE(low);
145 CTRL_WRITE(low, high, msrs, i);
146 }
147 }
148}
149
150
151static void athlon_stop(struct op_msrs const * const msrs)
152{
153 unsigned int low, high;
154 int i;
155
156 /* Subtle: stop on all counters to avoid race with
157 * setting our pm callback */
158 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
159 if (!reset_value[i])
160 continue;
161 CTRL_READ(low, high, msrs, i);
162 CTRL_SET_INACTIVE(low);
163 CTRL_WRITE(low, high, msrs, i);
164 }
165}
166
167static void athlon_shutdown(struct op_msrs const * const msrs)
168{
169 int i;
170
171 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
172 if (CTR_IS_RESERVED(msrs, i))
173 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
174 }
175 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
176 if (CTRL_IS_RESERVED(msrs, i))
177 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
178 }
179}
180
181struct op_x86_model_spec const op_athlon_spec = {
182 .num_counters = NUM_COUNTERS,
183 .num_controls = NUM_CONTROLS,
184 .fill_in_addresses = &athlon_fill_in_addresses,
185 .setup_ctrs = &athlon_setup_ctrs,
186 .check_ctrs = &athlon_check_ctrs,
187 .start = &athlon_start,
188 .stop = &athlon_stop,
189 .shutdown = &athlon_shutdown
190};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f47..43ac5af338d8 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
10 10
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h>
14#include <linux/nmi.h>
13#include <asm/msr.h> 15#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h> 16#include <asm/fixmap.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18
18 19
19#include "op_x86_model.h" 20#include "op_x86_model.h"
20#include "op_counter.h" 21#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
40static inline void setup_num_counters(void) 41static inline void setup_num_counters(void)
41{ 42{
42#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){ 44 if (smp_num_siblings == 2) {
44 num_counters = NUM_COUNTERS_HT2; 45 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2; 46 num_controls = NUM_CONTROLS_HT2;
46 } 47 }
@@ -86,7 +87,7 @@ struct p4_event_binding {
86#define CTR_FLAME_2 (1 << 6) 87#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7) 88#define CTR_IQ_5 (1 << 7)
88 89
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { 90static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, 91 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, 92 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, 93 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } 98 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98}; 99};
99 100
100#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT 101#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101 102
102/* p4 event codes in libop/op_event.h are indices into this table. */ 103/* p4 event codes in libop/op_event.h are indices into this table. */
103 104
104static struct p4_event_binding p4_events[NUM_EVENTS] = { 105static struct p4_event_binding p4_events[NUM_EVENTS] = {
105 106
106 { /* BRANCH_RETIRED */ 107 { /* BRANCH_RETIRED */
107 0x05, 0x06, 108 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, 109 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} } 110 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 }, 111 },
111 112
112 { /* MISPRED_BRANCH_RETIRED */ 113 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03, 114 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 115 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 116 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 }, 117 },
117 118
118 { /* TC_DELIVER_MODE */ 119 { /* TC_DELIVER_MODE */
119 0x01, 0x01, 120 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0}, 121 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} } 122 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 }, 123 },
123 124
124 { /* BPU_FETCH_REQUEST */ 125 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03, 126 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, 127 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} } 128 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 }, 129 },
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
146 }, 147 },
147 148
148 { /* LOAD_PORT_REPLAY */ 149 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04, 150 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, 151 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } 152 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 }, 153 },
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
170 }, 171 },
171 172
172 { /* BSQ_CACHE_REFERENCE */ 173 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c, 174 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 175 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} } 176 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 }, 177 },
177 178
178 { /* IOQ_ALLOCATION */ 179 { /* IOQ_ALLOCATION */
179 0x06, 0x03, 180 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 181 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } } 182 { 0, 0 } }
182 }, 183 },
183 184
184 { /* IOQ_ACTIVE_ENTRIES */ 185 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a, 186 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, 187 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } } 188 { 0, 0 } }
188 }, 189 },
189 190
190 { /* FSB_DATA_ACTIVITY */ 191 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17, 192 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 193 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 194 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 }, 195 },
195 196
196 { /* BSQ_ALLOCATION */ 197 { /* BSQ_ALLOCATION */
197 0x07, 0x05, 198 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 199 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } } 200 { 0, 0 } }
200 }, 201 },
201 202
202 { /* BSQ_ACTIVE_ENTRIES */ 203 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06, 204 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, 205 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } } 206 { 0, 0 } }
206 }, 207 },
207 208
208 { /* X87_ASSIST */ 209 { /* X87_ASSIST */
209 0x05, 0x03, 210 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 211 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 212 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 }, 213 },
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 217 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 218 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 }, 219 },
219 220
220 { /* PACKED_SP_UOP */ 221 { /* PACKED_SP_UOP */
221 0x01, 0x08, 222 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 223 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 224 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 }, 225 },
225 226
226 { /* PACKED_DP_UOP */ 227 { /* PACKED_DP_UOP */
227 0x01, 0x0c, 228 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 229 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 230 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 }, 231 },
231 232
232 { /* SCALAR_SP_UOP */ 233 { /* SCALAR_SP_UOP */
233 0x01, 0x0a, 234 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 235 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 236 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 }, 237 },
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
242 }, 243 },
243 244
244 { /* 64BIT_MMX_UOP */ 245 { /* 64BIT_MMX_UOP */
245 0x01, 0x02, 246 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 247 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 248 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 }, 249 },
249 250
250 { /* 128BIT_MMX_UOP */ 251 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a, 252 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 253 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 254 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 }, 255 },
255 256
256 { /* X87_FP_UOP */ 257 { /* X87_FP_UOP */
257 0x01, 0x04, 258 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 259 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 260 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 }, 261 },
261 262
262 { /* X87_SIMD_MOVES_UOP */ 263 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e, 264 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 265 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 266 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 }, 267 },
267 268
268 { /* MACHINE_CLEAR */ 269 { /* MACHINE_CLEAR */
269 0x05, 0x02, 270 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 271 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 272 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 }, 273 },
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 277 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 278 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 }, 279 },
279 280
280 { /* TC_MS_XFER */ 281 { /* TC_MS_XFER */
281 0x00, 0x05, 282 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0}, 283 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} } 284 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 }, 285 },
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
308 }, 309 },
309 310
310 { /* INSTR_RETIRED */ 311 { /* INSTR_RETIRED */
311 0x04, 0x02, 312 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 313 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 314 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 }, 315 },
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 320 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 }, 321 },
321 322
322 { /* UOP_TYPE */ 323 { /* UOP_TYPE */
323 0x02, 0x02, 324 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, 325 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} } 326 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 }, 327 },
327 328
328 { /* RETIRED_MISPRED_BRANCH_TYPE */ 329 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05, 330 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, 331 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} } 332 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 }, 333 },
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354 355
355#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367 368
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) 371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) 372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) 373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373 374
374 375
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
380#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
381 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
382 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
383#endif 384#endif
384 return 0; 385 return 0;
385} 386}
386 387
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
395 396
396static void p4_fill_in_addresses(struct op_msrs * const msrs) 397static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{ 398{
398 unsigned int i; 399 unsigned int i;
399 unsigned int addr, cccraddr, stag; 400 unsigned int addr, cccraddr, stag;
400 401
401 setup_num_counters(); 402 setup_num_counters();
402 stag = get_stagger(); 403 stag = get_stagger();
403 404
404 /* initialize some registers */ 405 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) { 406 for (i = 0; i < num_counters; ++i)
406 msrs->counters[i].addr = 0; 407 msrs->counters[i].addr = 0;
407 } 408 for (i = 0; i < num_controls; ++i)
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0; 409 msrs->controls[i].addr = 0;
410 } 410
411
412 /* the counter & cccr registers we pay attention to */ 411 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) { 412 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 413 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; 414 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){ 415 if (reserve_perfctr_nmi(addr)) {
417 msrs->counters[i].addr = addr; 416 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr; 417 msrs->controls[i].addr = cccraddr;
419 } 418 }
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
447 if (reserve_evntsel_nmi(addr)) 446 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr; 447 msrs->controls[i].addr = addr;
449 } 448 }
450 449
451 for (addr = MSR_P4_MS_ESCR0 + stag; 450 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { 451 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr)) 452 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr; 453 msrs->controls[i].addr = addr;
455 } 454 }
456 455
457 for (addr = MSR_P4_IX_ESCR0 + stag; 456 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { 457 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr)) 458 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr; 459 msrs->controls[i].addr = addr;
461 } 460 }
462 461
463 /* there are 2 remaining non-contiguously located ESCRs */ 462 /* there are 2 remaining non-contiguously located ESCRs */
464 463
465 if (num_counters == NUM_COUNTERS_NON_HT) { 464 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/ 465 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) 466 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 467 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
498 unsigned int stag; 497 unsigned int stag;
499 498
500 stag = get_stagger(); 499 stag = get_stagger();
501 500
502 /* convert from counter *number* to counter *bit* */ 501 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr); 502 counter_bit = 1 << VIRT_CTR(stag, ctr);
504 503
505 /* find our event binding structure. */ 504 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { 505 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR 506 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n", 507 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event); 508 counter_config[ctr].event);
510 return; 509 return;
511 } 510 }
512 511
513 ev = &(p4_events[counter_config[ctr].event - 1]); 512 ev = &(p4_events[counter_config[ctr].event - 1]);
514 513
515 for (i = 0; i < maxbind; i++) { 514 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) { 515 if (ev->bindings[i].virt_counter & counter_bit) {
517 516
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel); 525 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 } 526 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 527 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i); 529 ESCR_WRITE(escr, high, ev, i);
531 530
532 /* modify CCCR */ 531 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr); 533 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr); 534 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) { 536 if (stag == 0)
538 CCCR_SET_PMI_OVF_0(cccr); 537 CCCR_SET_PMI_OVF_0(cccr);
539 } else { 538 else
540 CCCR_SET_PMI_OVF_1(cccr); 539 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return; 541 return;
544 } 542 }
545 } 543 }
546 544
547 printk(KERN_ERR 545 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", 546 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr); 547 counter_config[ctr].event, stag, ctr);
550} 548}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
559 stag = get_stagger(); 557 stag = get_stagger();
560 558
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 559 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) { 560 if (!MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n"); 561 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return; 562 return;
565 } 563 }
566 564
567 /* clear the cccrs we will use */ 565 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) { 566 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 567 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
570 continue; 568 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low); 570 CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
576 574
577 /* clear all escrs (including those outside our concern) */ 575 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) { 576 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 577 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
580 continue; 578 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0); 579 wrmsr(msrs->controls[i].addr, 0, 0);
582 } 580 }
583 581
584 /* setup all counters */ 582 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) { 583 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { 584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
587 reset_value[i] = counter_config[i].count; 585 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i); 586 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
603 stag = get_stagger(); 601 stag = get_stagger();
604 602
605 for (i = 0; i < num_counters; ++i) { 603 for (i = 0; i < num_counters; ++i) {
606 604
607 if (!reset_value[i]) 605 if (!reset_value[i])
608 continue; 606 continue;
609 607
610 /* 608 /*
611 * there is some eccentricity in the hardware which 609 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections: 610 * requires that we perform 2 extra corrections:
613 * 611 *
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
616 * 614 *
617 * - write the counter back twice to ensure it gets 615 * - write the counter back twice to ensure it gets
618 * updated properly. 616 * updated properly.
619 * 617 *
620 * the former seems to be related to extra NMIs happening 618 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata 619 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification 620 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not 621 * update, though their suggested work-around does not
624 * appear to solve the problem. 622 * appear to solve the problem.
625 */ 623 */
626 624
627 real = VIRT_CTR(stag, i); 625 real = VIRT_CTR(stag, i);
628 626
629 CCCR_READ(low, high, real); 627 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real); 628 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i); 630 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real); 631 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low); 632 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real); 633 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real); 634 CTR_WRITE(reset_value[i], real);
637 } 635 }
638 } 636 }
639 637
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
683 int i; 681 int i;
684 682
685 for (i = 0 ; i < num_counters ; ++i) { 683 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i)) 684 if (CTR_IS_RESERVED(msrs, i))
687 release_perfctr_nmi(msrs->counters[i].addr); 685 release_perfctr_nmi(msrs->counters[i].addr);
688 } 686 }
689 /* some of the control registers are specially reserved in 687 /*
688 * some of the control registers are specially reserved in
690 * conjunction with the counter registers (hence the starting offset). 689 * conjunction with the counter registers (hence the starting offset).
691 * This saves a few bits. 690 * This saves a few bits.
692 */ 691 */
693 for (i = num_counters ; i < num_controls ; ++i) { 692 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i)) 693 if (CTRL_IS_RESERVED(msrs, i))
695 release_evntsel_nmi(msrs->controls[i].addr); 694 release_evntsel_nmi(msrs->controls[i].addr);
696 } 695 }
697} 696}
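
For readers who have not internalized the P4 PMU layout, here is a minimal, self-contained sketch of how the ESCR helper macros above compose an event-select value; the bit positions mirror ESCR_SET_EVENT_SELECT / ESCR_SET_EVENT_MASK / ESCR_SET_OS_1, while the numeric event values are purely illustrative and are not asserted to match any particular p4_events[] entry.

#include <stdio.h>

/*
 * Mirrors the shifts used by the ESCR_SET_* macros in op_model_p4.c:
 * event select -> bits 30:25, event mask -> bits 24:9, T1 OS enable -> bit 1.
 */
static unsigned int build_escr(unsigned int sel, unsigned int mask, int kernel)
{
	unsigned int escr = 0;

	escr |= (sel & 0x3f) << 25;	/* ESCR_SET_EVENT_SELECT */
	escr |= (mask & 0xffff) << 9;	/* ESCR_SET_EVENT_MASK   */
	escr |= (kernel & 1) << 1;	/* ESCR_SET_OS_1         */
	return escr;
}

int main(void)
{
	/* illustrative numbers only */
	printf("escr = 0x%08x\n", build_escr(0x06, 0x0001, 1));
	return 0;
}
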
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605fa71d0..05a0261ba0c3 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -32,6 +32,8 @@ struct pt_regs;
32 * various x86 CPU models' perfctr support. 32 * various x86 CPU models' perfctr support.
33 */ 33 */
34struct op_x86_model_spec { 34struct op_x86_model_spec {
35 int (*init)(struct oprofile_operations *ops);
36 void (*exit)(void);
35 unsigned int const num_counters; 37 unsigned int const num_counters;
36 unsigned int const num_controls; 38 unsigned int const num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs); 39 void (*fill_in_addresses)(struct op_msrs * const msrs);
@@ -46,6 +48,6 @@ struct op_x86_model_spec {
46extern struct op_x86_model_spec const op_ppro_spec; 48extern struct op_x86_model_spec const op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec; 49extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec; 50extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec; 51extern struct op_x86_model_spec const op_amd_spec;
50 52
51#endif /* OP_X86_MODEL_H */ 53#endif /* OP_X86_MODEL_H */
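
The two callbacks added to struct op_x86_model_spec give each CPU model an explicit setup/teardown hook. As a hedged sketch only (op_foo_* is an invented name, not part of this patch, and only fields visible in this header are shown), a model definition would now look roughly like:

#include <linux/oprofile.h>	/* struct oprofile_operations */
#include "op_x86_model.h"	/* the struct as patched above */

static int op_foo_init(struct oprofile_operations *ops)
{
	/* e.g. adjust ops-> callbacks or allocate model-private state */
	return 0;
}

static void op_foo_exit(void)
{
	/* undo whatever op_foo_init() set up */
}

struct op_x86_model_spec const op_foo_spec = {
	.init		= op_foo_init,
	.exit		= op_foo_exit,
	.num_counters	= 2,
	.num_controls	= 2,
	/* .fill_in_addresses, .setup_ctrs, ... filled in as before */
};
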
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index c5c8e485fc44..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -1,5 +1,17 @@
1ifeq ($(CONFIG_X86_32),y) 1obj-y := i386.o init.o
2include ${srctree}/arch/x86/pci/Makefile_32 2
3else 3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4include ${srctree}/arch/x86/pci/Makefile_64 4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5endif 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8obj-y += fixup.o
9obj-$(CONFIG_ACPI) += acpi.o
10obj-y += legacy.o irq.o
11
12obj-$(CONFIG_X86_VISWS) += visws.o
13
14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15
16obj-y += common.o early.o
17obj-y += amd_bus.o
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
deleted file mode 100644
index 89ec35d00efd..000000000000
--- a/arch/x86/pci/Makefile_32
+++ /dev/null
@@ -1,24 +0,0 @@
1obj-y := i386.o init.o
2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8pci-y := fixup.o
9
10# Do not change the ordering here. There is a nasty init function
11# ordering dependency which breaks when you move acpi.o below
12# legacy/irq.o
13pci-$(CONFIG_ACPI) += acpi.o
14pci-y += legacy.o irq.o
15
16# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are
17# therefore correct. This needs a proper fix by disentangling the code.
18pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
19pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
20
21# Necessary for NUMAQ as well
22pci-$(CONFIG_NUMA) += mp_bus_to_node.o
23
24obj-y += $(pci-y) common.o early.o
diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64
deleted file mode 100644
index 8fbd19832cf6..000000000000
--- a/arch/x86/pci/Makefile_64
+++ /dev/null
@@ -1,17 +0,0 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6EXTRA_CFLAGS += -Iarch/x86/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o
15
16obj-y += k8-bus_64.o
17
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d95de2f199cd..1d88d2b39771 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -171,8 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
171 if (node != -1) 171 if (node != -1)
172 set_mp_bus_to_node(busnum, node); 172 set_mp_bus_to_node(busnum, node);
173 else 173 else
174 node = get_mp_bus_to_node(busnum);
175#endif 174#endif
175 node = get_mp_bus_to_node(busnum);
176
177 if (node != -1 && !node_online(node))
178 node = -1;
176 179
177 /* Allocate per-root-bus (not per bus) arch-specific data. 180 /* Allocate per-root-bus (not per bus) arch-specific data.
178 * TODO: leak; this memory is never freed. 181 * TODO: leak; this memory is never freed.
@@ -204,22 +207,23 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
204 if (!bus) 207 if (!bus)
205 kfree(sd); 208 kfree(sd);
206 209
210 if (bus && node != -1) {
207#ifdef CONFIG_ACPI_NUMA 211#ifdef CONFIG_ACPI_NUMA
208 if (bus) { 212 if (pxm >= 0)
209 if (pxm >= 0) {
210 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", 213 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
211 busnum, pxm, pxm_to_node(pxm)); 214 busnum, pxm, node);
212 } 215#else
213 } 216 printk(KERN_DEBUG "bus %02x -> node %d\n",
217 busnum, node);
214#endif 218#endif
219 }
215 220
216 if (bus && (pci_probe & PCI_USE__CRS)) 221 if (bus && (pci_probe & PCI_USE__CRS))
217 get_current_resources(device, busnum, domain, bus); 222 get_current_resources(device, busnum, domain, bus);
218 return bus; 223 return bus;
219} 224}
220 225
221extern int pci_routeirq; 226int __init pci_acpi_init(void)
222static int __init pci_acpi_init(void)
223{ 227{
224 struct pci_dev *dev = NULL; 228 struct pci_dev *dev = NULL;
225 229
@@ -246,11 +250,5 @@ static int __init pci_acpi_init(void)
246 acpi_pci_irq_enable(dev); 250 acpi_pci_irq_enable(dev);
247 } 251 }
248 252
249#ifdef CONFIG_X86_IO_APIC
250 if (acpi_ioapic)
251 print_IO_APIC();
252#endif
253
254 return 0; 253 return 0;
255} 254}
256subsys_initcall(pci_acpi_init);
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/amd_bus.c
index 5c2799c20e47..22e057665e55 100644
--- a/arch/x86/pci/k8-bus_64.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,40 +1,26 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h>
4#include <linux/cpu.h>
5#include "pci.h"
6
7#ifdef CONFIG_X86_64
3#include <asm/pci-direct.h> 8#include <asm/pci-direct.h>
4#include <asm/mpspec.h> 9#include <asm/mpspec.h>
5#include <linux/cpumask.h> 10#include <linux/cpumask.h>
6#include <linux/topology.h> 11#endif
7 12
8/* 13/*
9 * This discovers the pcibus <-> node mapping on AMD K8. 14 * This discovers the pcibus <-> node mapping on AMD K8.
10 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
11 */ 16 */
12 17
13
14/*
15 * sub bus (transparent) will use entries from 3 onwards to store extras from the root,
16 * so need to make sure there are enough slots there; increase PCI_BUS_NUM_RESOURCES?
17 */
18#define RES_NUM 16
19struct pci_root_info {
20 char name[12];
21 unsigned int res_num;
22 struct resource res[RES_NUM];
23 int bus_min;
24 int bus_max;
25 int node;
26 int link;
27};
28
29/* 4 at this time, it may grow to 32 */
30#define PCI_ROOT_NR 4
31static int pci_root_num;
32static struct pci_root_info pci_root_info[PCI_ROOT_NR];
33
34#ifdef CONFIG_NUMA 18#ifdef CONFIG_NUMA
35 19
36#define BUS_NR 256 20#define BUS_NR 256
37 21
22#ifdef CONFIG_X86_64
23
38static int mp_bus_to_node[BUS_NR]; 24static int mp_bus_to_node[BUS_NR];
39 25
40void set_mp_bus_to_node(int busnum, int node) 26void set_mp_bus_to_node(int busnum, int node)
@@ -61,7 +47,52 @@ int get_mp_bus_to_node(int busnum)
61 47
62 return node; 48 return node;
63} 49}
64#endif 50
51#else /* CONFIG_X86_32 */
52
53static unsigned char mp_bus_to_node[BUS_NR];
54
55void set_mp_bus_to_node(int busnum, int node)
56{
57 if (busnum >= 0 && busnum < BUS_NR)
58 mp_bus_to_node[busnum] = (unsigned char) node;
59}
60
61int get_mp_bus_to_node(int busnum)
62{
63 int node;
64
65 if (busnum < 0 || busnum > (BUS_NR - 1))
66 return 0;
67 node = mp_bus_to_node[busnum];
68 return node;
69}
70
71#endif /* CONFIG_X86_32 */
72
73#endif /* CONFIG_NUMA */
74
75#ifdef CONFIG_X86_64
76
77/*
78 * sub bus (transparent) will use entries from 3 onwards to store extras from the root,
79 * so need to make sure there are enough slots there; increase PCI_BUS_NUM_RESOURCES?
80 */
81#define RES_NUM 16
82struct pci_root_info {
83 char name[12];
84 unsigned int res_num;
85 struct resource res[RES_NUM];
86 int bus_min;
87 int bus_max;
88 int node;
89 int link;
90};
91
92/* 4 at this time, it may grow to 32 */
93#define PCI_ROOT_NR 4
94static int pci_root_num;
95static struct pci_root_info pci_root_info[PCI_ROOT_NR];
65 96
66void set_pci_bus_resources_arch_default(struct pci_bus *b) 97void set_pci_bus_resources_arch_default(struct pci_bus *b)
67{ 98{
@@ -384,7 +415,7 @@ static int __init early_fill_mp_bus_info(void)
384 /* need to take out [0, TOM) for RAM*/ 415 /* need to take out [0, TOM) for RAM*/
385 address = MSR_K8_TOP_MEM1; 416 address = MSR_K8_TOP_MEM1;
386 rdmsrl(address, val); 417 rdmsrl(address, val);
387 end = (val & 0xffffff8000000ULL); 418 end = (val & 0xffffff800000ULL);
388 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 419 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
389 if (end < (1ULL<<32)) 420 if (end < (1ULL<<32))
390 update_range(range, 0, end - 1); 421 update_range(range, 0, end - 1);
@@ -478,7 +509,7 @@ static int __init early_fill_mp_bus_info(void)
478 /* TOP_MEM2 */ 509 /* TOP_MEM2 */
479 address = MSR_K8_TOP_MEM2; 510 address = MSR_K8_TOP_MEM2;
480 rdmsrl(address, val); 511 rdmsrl(address, val);
481 end = (val & 0xffffff8000000ULL); 512 end = (val & 0xffffff800000ULL);
482 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 513 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
483 update_range(range, 1ULL<<32, end - 1); 514 update_range(range, 1ULL<<32, end - 1);
484 } 515 }
@@ -525,4 +556,71 @@ static int __init early_fill_mp_bus_info(void)
525 return 0; 556 return 0;
526} 557}
527 558
528postcore_initcall(early_fill_mp_bus_info); 559#else /* !CONFIG_X86_64 */
560
561static int __init early_fill_mp_bus_info(void) { return 0; }
562
563#endif /* !CONFIG_X86_64 */
564
565/* common 32/64 bit code */
566
567#define ENABLE_CF8_EXT_CFG (1ULL << 46)
568
569static void enable_pci_io_ecs(void *unused)
570{
571 u64 reg;
572 rdmsrl(MSR_AMD64_NB_CFG, reg);
573 if (!(reg & ENABLE_CF8_EXT_CFG)) {
574 reg |= ENABLE_CF8_EXT_CFG;
575 wrmsrl(MSR_AMD64_NB_CFG, reg);
576 }
577}
578
579static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu)
581{
582 int cpu = (long)hcpu;
583 switch (action) {
584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
587 break;
588 default:
589 break;
590 }
591 return NOTIFY_OK;
592}
593
594static struct notifier_block __cpuinitdata amd_cpu_notifier = {
595 .notifier_call = amd_cpu_notify,
596};
597
598static int __init pci_io_ecs_init(void)
599{
600 int cpu;
601
602 /* assume all cpus from fam10h have IO ECS */
603 if (boot_cpu_data.x86 < 0x10)
604 return 0;
605
606 register_cpu_notifier(&amd_cpu_notifier);
607 for_each_online_cpu(cpu)
608 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
609 (void *)(long)cpu);
610 pci_probe |= PCI_HAS_IO_ECS;
611
612 return 0;
613}
614
615static int __init amd_postcore_init(void)
616{
617 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
618 return 0;
619
620 early_fill_mp_bus_info();
621 pci_io_ecs_init();
622
623 return 0;
624}
625
626postcore_initcall(amd_postcore_init);
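
On the 32-bit side the bus-to-node map above shrinks to one byte per bus. A minimal usage sketch of the two accessors follows; the caller, bus number and node value are illustrative, and the declarations are assumed to live in the local pci.h, as the other callers in this series suggest.

#include <linux/kernel.h>
#include <linux/init.h>
#include "pci.h"	/* assumed home of set_mp_bus_to_node()/get_mp_bus_to_node() */

static void __init note_root_bus_node(int busnum, int node)
{
	set_mp_bus_to_node(busnum, node);	/* silently ignored if busnum is outside 0..255 */
	if (get_mp_bus_to_node(busnum) != node)
		printk(KERN_WARNING "PCI: bus %02x kept no node mapping\n", busnum);
}
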
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 940185ecaeda..b67732bbb85a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -20,6 +20,7 @@
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 21 PCI_PROBE_MMCONF;
22 22
23unsigned int pci_early_dump_regs;
23static int pci_bf_sort; 24static int pci_bf_sort;
24int pci_routeirq; 25int pci_routeirq;
25int pcibios_last_bus = -1; 26int pcibios_last_bus = -1;
@@ -31,7 +32,7 @@ struct pci_raw_ops *raw_pci_ext_ops;
31int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, 32int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
32 int reg, int len, u32 *val) 33 int reg, int len, u32 *val)
33{ 34{
34 if (reg < 256 && raw_pci_ops) 35 if (domain == 0 && reg < 256 && raw_pci_ops)
35 return raw_pci_ops->read(domain, bus, devfn, reg, len, val); 36 return raw_pci_ops->read(domain, bus, devfn, reg, len, val);
36 if (raw_pci_ext_ops) 37 if (raw_pci_ext_ops)
37 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); 38 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val);
@@ -41,7 +42,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
41int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, 42int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
42 int reg, int len, u32 val) 43 int reg, int len, u32 val)
43{ 44{
44 if (reg < 256 && raw_pci_ops) 45 if (domain == 0 && reg < 256 && raw_pci_ops)
45 return raw_pci_ops->write(domain, bus, devfn, reg, len, val); 46 return raw_pci_ops->write(domain, bus, devfn, reg, len, val);
46 if (raw_pci_ext_ops) 47 if (raw_pci_ext_ops)
47 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); 48 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val);
@@ -121,6 +122,21 @@ void __init dmi_check_skip_isa_align(void)
121 dmi_check_system(can_skip_pciprobe_dmi_table); 122 dmi_check_system(can_skip_pciprobe_dmi_table);
122} 123}
123 124
125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
126{
127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
128
129 if (pci_probe & PCI_NOASSIGN_ROMS) {
130 if (rom_r->parent)
131 return;
132 if (rom_r->start) {
133 /* we deal with BIOS assigned ROM later */
134 return;
135 }
136 rom_r->start = rom_r->end = rom_r->flags = 0;
137 }
138}
139
124/* 140/*
125 * Called after each bus is probed, but before its children 141 * Called after each bus is probed, but before its children
126 * are examined. 142 * are examined.
@@ -128,7 +144,11 @@ void __init dmi_check_skip_isa_align(void)
128 144
129void __devinit pcibios_fixup_bus(struct pci_bus *b) 145void __devinit pcibios_fixup_bus(struct pci_bus *b)
130{ 146{
147 struct pci_dev *dev;
148
131 pci_read_bridge_bases(b); 149 pci_read_bridge_bases(b);
150 list_for_each_entry(dev, &b->devices, bus_list)
151 pcibios_fixup_device_resources(dev);
132} 152}
133 153
134/* 154/*
@@ -328,18 +348,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
328#endif 348#endif
329 { 349 {
330 .callback = set_bf_sort, 350 .callback = set_bf_sort,
331 .ident = "HP ProLiant DL360", 351 .ident = "HP ProLiant DL385 G2",
332 .matches = { 352 .matches = {
333 DMI_MATCH(DMI_SYS_VENDOR, "HP"), 353 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
334 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"), 354 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"),
335 }, 355 },
336 }, 356 },
337 { 357 {
338 .callback = set_bf_sort, 358 .callback = set_bf_sort,
339 .ident = "HP ProLiant DL380", 359 .ident = "HP ProLiant DL585 G2",
340 .matches = { 360 .matches = {
341 DMI_MATCH(DMI_SYS_VENDOR, "HP"), 361 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
342 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"), 362 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
343 }, 363 },
344 }, 364 },
345 {} 365 {}
@@ -384,7 +404,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
384 404
385extern u8 pci_cache_line_size; 405extern u8 pci_cache_line_size;
386 406
387static int __init pcibios_init(void) 407int __init pcibios_init(void)
388{ 408{
389 struct cpuinfo_x86 *c = &boot_cpu_data; 409 struct cpuinfo_x86 *c = &boot_cpu_data;
390 410
@@ -411,8 +431,6 @@ static int __init pcibios_init(void)
411 return 0; 431 return 0;
412} 432}
413 433
414subsys_initcall(pcibios_init);
415
416char * __devinit pcibios_setup(char *str) 434char * __devinit pcibios_setup(char *str)
417{ 435{
418 if (!strcmp(str, "off")) { 436 if (!strcmp(str, "off")) {
@@ -483,12 +501,18 @@ char * __devinit pcibios_setup(char *str)
483 else if (!strcmp(str, "rom")) { 501 else if (!strcmp(str, "rom")) {
484 pci_probe |= PCI_ASSIGN_ROMS; 502 pci_probe |= PCI_ASSIGN_ROMS;
485 return NULL; 503 return NULL;
504 } else if (!strcmp(str, "norom")) {
505 pci_probe |= PCI_NOASSIGN_ROMS;
506 return NULL;
486 } else if (!strcmp(str, "assign-busses")) { 507 } else if (!strcmp(str, "assign-busses")) {
487 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 508 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
488 return NULL; 509 return NULL;
489 } else if (!strcmp(str, "use_crs")) { 510 } else if (!strcmp(str, "use_crs")) {
490 pci_probe |= PCI_USE__CRS; 511 pci_probe |= PCI_USE__CRS;
491 return NULL; 512 return NULL;
513 } else if (!strcmp(str, "earlydump")) {
514 pci_early_dump_regs = 1;
515 return NULL;
492 } else if (!strcmp(str, "routeirq")) { 516 } else if (!strcmp(str, "routeirq")) {
493 pci_routeirq = 1; 517 pci_routeirq = 1;
494 return NULL; 518 return NULL;
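
With the new "norom" and "earlydump" cases in pcibios_setup(), and assuming the usual comma-separated pci= parsing in front of it, boot-time usage would be along the lines of:

	pci=norom,earlydump

where norom prevents the kernel from assigning addresses to expansion ROM BARs the BIOS left unassigned (per pcibios_fixup_device_resources() above) and earlydump sets pci_early_dump_regs, presumably consumed by the early config-space dump helpers added to early.c further down.
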
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 21d1e0e0d535..9915293500fb 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -8,18 +8,21 @@
8#include "pci.h" 8#include "pci.h"
9 9
10/* 10/*
11 * Functions for accessing PCI configuration space with type 1 accesses 11 * Functions for accessing PCI base (first 256 bytes) and extended
12 * (4096 bytes per PCI function) configuration space with type 1
13 * accesses.
12 */ 14 */
13 15
14#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ 16#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) 17 (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \
18 | (devfn << 8) | (reg & 0xFC))
16 19
17static int pci_conf1_read(unsigned int seg, unsigned int bus, 20static int pci_conf1_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value) 21 unsigned int devfn, int reg, int len, u32 *value)
19{ 22{
20 unsigned long flags; 23 unsigned long flags;
21 24
22 if ((bus > 255) || (devfn > 255) || (reg > 255)) { 25 if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
23 *value = -1; 26 *value = -1;
24 return -EINVAL; 27 return -EINVAL;
25 } 28 }
@@ -50,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
50{ 53{
51 unsigned long flags; 54 unsigned long flags;
52 55
53 if ((bus > 255) || (devfn > 255) || (reg > 255)) 56 if ((bus > 255) || (devfn > 255) || (reg > 4095))
54 return -EINVAL; 57 return -EINVAL;
55 58
56 spin_lock_irqsave(&pci_config_lock, flags); 59 spin_lock_irqsave(&pci_config_lock, flags);
@@ -260,10 +263,18 @@ void __init pci_direct_init(int type)
260 return; 263 return;
261 printk(KERN_INFO "PCI: Using configuration type %d for base access\n", 264 printk(KERN_INFO "PCI: Using configuration type %d for base access\n",
262 type); 265 type);
263 if (type == 1) 266 if (type == 1) {
264 raw_pci_ops = &pci_direct_conf1; 267 raw_pci_ops = &pci_direct_conf1;
265 else 268 if (raw_pci_ext_ops)
266 raw_pci_ops = &pci_direct_conf2; 269 return;
270 if (!(pci_probe & PCI_HAS_IO_ECS))
271 return;
272 printk(KERN_INFO "PCI: Using configuration type 1 "
273 "for extended access\n");
274 raw_pci_ext_ops = &pci_direct_conf1;
275 return;
276 }
277 raw_pci_ops = &pci_direct_conf2;
267} 278}
268 279
269int __init pci_direct_probe(void) 280int __init pci_direct_probe(void)
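
To make the widened PCI_CONF1_ADDRESS() concrete: the extra term ((reg & 0xF00) << 16) places register bits 11:8 into CF8 bits 27:24, the extended-configuration (IO ECS) encoding that the amd_bus.c change above switches on. A small self-contained sketch with illustrative values:

#include <stdio.h>
#include <stdint.h>

/* same arithmetic as the patched PCI_CONF1_ADDRESS() macro */
static uint32_t conf1_address(unsigned int bus, unsigned int devfn, unsigned int reg)
{
	return 0x80000000u | ((reg & 0xF00) << 16)
			   | (bus << 16) | (devfn << 8) | (reg & 0xFC);
}

int main(void)
{
	/* register 0x100 of 00:18.0 (devfn 0xc0): classic CF8 could not reach it */
	printf("cf8 = 0x%08x\n", conf1_address(0, 0xc0, 0x100));
	return 0;
}
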
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 42df4b6606df..86631ccbc25a 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -7,15 +7,13 @@
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
9 9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) 10u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{ 11{
14 u32 v; 12 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc); 14 v = inl(0xcfc);
17 if (v != 0xffffffff) 15 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); 16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v; 17 return v;
20} 18}
21 19
@@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
24 u8 v; 22 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3)); 24 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); 25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v; 26 return v;
29} 27}
30 28
@@ -33,23 +31,30 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
33 u16 v; 31 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2)); 33 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); 34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v; 35 return v;
38} 36}
39 37
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val) 39 u32 val)
42{ 40{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val); 41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc); 43 outl(val, 0xcfc);
46} 44}
47 45
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 47{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3));
51}
52
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc); 57 outw(val, 0xcfc + (offset&2));
53} 58}
54 59
55int early_pci_allowed(void) 60int early_pci_allowed(void)
@@ -57,3 +62,54 @@ int early_pci_allowed(void)
57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 62 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
58 PCI_PROBE_CONF1; 63 PCI_PROBE_CONF1;
59} 64}
65
66void early_dump_pci_device(u8 bus, u8 slot, u8 func)
67{
68 int i;
69 int j;
70 u32 val;
71
72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
73
74 for (i = 0; i < 256; i += 4) {
75 if (!(i & 0x0f))
76 printk("\n%04x:",i);
77
78 val = read_pci_config(bus, slot, func, i);
79 for (j = 0; j < 4; j++) {
80 printk(" %02x", val & 0xff);
81 val >>= 8;
82 }
83 }
84 printk("\n");
85}
86
87void early_dump_pci_devices(void)
88{
89 unsigned bus, slot, func;
90
91 if (!early_pci_allowed())
92 return;
93
94 for (bus = 0; bus < 256; bus++) {
95 for (slot = 0; slot < 32; slot++) {
96 for (func = 0; func < 8; func++) {
97 u32 class;
98 u8 type;
99 class = read_pci_config(bus, slot, func,
100 PCI_CLASS_REVISION);
101 if (class == 0xffffffff)
102 break;
103
104 early_dump_pci_device(bus, slot, func);
105
106 /* No multi-function device? */
107 type = read_pci_config_byte(bus, slot, func,
108 PCI_HEADER_TYPE);
109 if (!(type & 0x80))
110 break;
111 }
112 }
113 }
114}
115
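
A hedged sketch of how an early-boot caller might use the helpers above; the call site and the PCI_VENDOR_ID read are illustrative, and read_pci_config(), early_pci_allowed() and early_dump_pci_device() are assumed to be exposed through <asm/pci-direct.h>.

#include <linux/init.h>
#include <linux/types.h>
#include <linux/pci_regs.h>
#include <asm/pci-direct.h>

static void __init dump_host_bridge(void)
{
	u32 id;

	if (!early_pci_allowed())
		return;
	id = read_pci_config(0, 0, 0, PCI_VENDOR_ID);	/* vendor|device in one dword */
	if (id != 0xffffffff)				/* all-ones means nothing there */
		early_dump_pci_device(0, 0, 0);
}
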
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a336342..3c27a809393b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb);
27 if (busno) 28 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */ 29 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb) 30 if (suba < subb)
@@ -510,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
510DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); 511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); 513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
514
515/*
516 * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
517 * confusing the PCI engine:
518 */
519static void sb600_disable_hpet_bar(struct pci_dev *dev)
520{
521 u8 val;
522
523 /*
524 * The SB600 and SB700 both share the same device
525 * ID, but the PM register 0x55 does something different
526 * for the SB700, so make sure we are dealing with the
527 * SB600 before touching the bit:
528 */
529
530 pci_read_config_byte(dev, 0x08, &val);
531
532 if (val < 0x2F) {
533 outb(0x55, 0xCD6);
534 val = inb(0xCD7);
535
536 /* Set bit 7 in PM register 0x55 */
537 outb(0x55, 0xCD6);
538 outb(val | 0x80, 0xCD7);
539 }
540}
541DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
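
The SB600 quirk above talks to the southbridge power-management block through the index/data port pair 0xCD6/0xCD7; register 0x55 and bit 7 come from the fixup itself. A short sketch of that access pattern in isolation (helper names are invented for illustration):

#include <linux/types.h>
#include <linux/io.h>		/* inb()/outb() */

static u8 sbx00_pm_read(u8 reg)
{
	outb(reg, 0xCD6);	/* PM index port */
	return inb(0xCD7);	/* PM data port  */
}

static void sbx00_pm_set_bits(u8 reg, u8 bits)
{
	u8 val = sbx00_pm_read(reg);

	outb(reg, 0xCD6);
	outb(val | bits, 0xCD7);
}
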
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 10fb308fded8..844df0cbbd3e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h>
36 37
37#include "pci.h" 38#include "pci.h"
38 39
@@ -128,10 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
128 pr = pci_find_parent_resource(dev, r); 129 pr = pci_find_parent_resource(dev, r);
129 if (!r->start || !pr || 130 if (!r->start || !pr ||
130 request_resource(pr, r) < 0) { 131 request_resource(pr, r) < 0) {
131 printk(KERN_ERR "PCI: Cannot allocate " 132 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
132 "resource region %d "
133 "of bridge %s\n",
134 idx, pci_name(dev));
135 /* 133 /*
136 * Something is wrong with the region. 134 * Something is wrong with the region.
137 * Invalidate the resource to prevent 135 * Invalidate the resource to prevent
@@ -166,15 +164,13 @@ static void __init pcibios_allocate_resources(int pass)
166 else 164 else
167 disabled = !(command & PCI_COMMAND_MEMORY); 165 disabled = !(command & PCI_COMMAND_MEMORY);
168 if (pass == disabled) { 166 if (pass == disabled) {
169 DBG("PCI: Resource %08lx-%08lx " 167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
170 "(f=%lx, d=%d, p=%d)\n", 168 (unsigned long long) r->start,
171 r->start, r->end, r->flags, disabled, pass); 169 (unsigned long long) r->end,
170 r->flags, disabled, pass);
172 pr = pci_find_parent_resource(dev, r); 171 pr = pci_find_parent_resource(dev, r);
173 if (!pr || request_resource(pr, r) < 0) { 172 if (!pr || request_resource(pr, r) < 0) {
174 printk(KERN_ERR "PCI: Cannot allocate " 173 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
175 "resource region %d "
176 "of device %s\n",
177 idx, pci_name(dev));
178 /* We'll assign a new address later */ 174 /* We'll assign a new address later */
179 r->end -= r->start; 175 r->end -= r->start;
180 r->start = 0; 176 r->start = 0;
@@ -187,8 +183,7 @@ static void __init pcibios_allocate_resources(int pass)
187 /* Turn the ROM off, leave the resource region, 183 /* Turn the ROM off, leave the resource region,
188 * but keep it unregistered. */ 184 * but keep it unregistered. */
189 u32 reg; 185 u32 reg;
190 DBG("PCI: Switching off ROM of %s\n", 186 dev_dbg(&dev->dev, "disabling ROM\n");
191 pci_name(dev));
192 r->flags &= ~IORESOURCE_ROM_ENABLE; 187 r->flags &= ~IORESOURCE_ROM_ENABLE;
193 pci_read_config_dword(dev, 188 pci_read_config_dword(dev,
194 dev->rom_base_reg, &reg); 189 dev->rom_base_reg, &reg);
@@ -233,6 +228,8 @@ void __init pcibios_resource_survey(void)
233 pcibios_allocate_bus_resources(&pci_root_buses); 228 pcibios_allocate_bus_resources(&pci_root_buses);
234 pcibios_allocate_resources(0); 229 pcibios_allocate_resources(0);
235 pcibios_allocate_resources(1); 230 pcibios_allocate_resources(1);
231
232 e820_reserve_resources_late();
236} 233}
237 234
238/** 235/**
@@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev)
257 lat = pcibios_max_latency; 254 lat = pcibios_max_latency;
258 else 255 else
259 return; 256 return;
260 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", 257 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
261 pci_name(dev), lat);
262 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
263} 259}
264 260
@@ -280,6 +276,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 276static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 277 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 278 .close = pci_unmap_page_range,
279 .access = generic_access_phys,
283}; 280};
284 281
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 282int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
@@ -299,9 +296,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
299 return -EINVAL; 296 return -EINVAL;
300 297
301 prot = pgprot_val(vma->vm_page_prot); 298 prot = pgprot_val(vma->vm_page_prot);
302 if (pat_wc_enabled && write_combine) 299 if (pat_enabled && write_combine)
303 prot |= _PAGE_CACHE_WC; 300 prot |= _PAGE_CACHE_WC;
304 else if (pat_wc_enabled || boot_cpu_data.x86 > 3) 301 else if (pat_enabled || boot_cpu_data.x86 > 3)
305 /* 302 /*
306 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 303 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
307 * To avoid attribute conflicts, request UC MINUS here 304 * To avoid attribute conflicts, request UC MINUS here
@@ -334,7 +331,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
334 flags = new_flags; 331 flags = new_flags;
335 } 332 }
336 333
337 if (vma->vm_pgoff <= max_pfn_mapped && 334 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
335 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
336 vma->vm_pgoff < max_pfn_mapped)) &&
338 ioremap_change_attr((unsigned long)__va(addr), len, flags)) { 337 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
339 free_memtype(addr, addr + len); 338 free_memtype(addr, addr + len);
340 return -EINVAL; 339 return -EINVAL;
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index b821f4462d99..d6c950f81858 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -4,7 +4,7 @@
4 4
5/* arch_initcall has too random ordering, so call the initializers 5/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 6 in the right sequence from here. */
7static __init int pci_access_init(void) 7static __init int pci_arch_init(void)
8{ 8{
9#ifdef CONFIG_PCI_DIRECT 9#ifdef CONFIG_PCI_DIRECT
10 int type = 0; 10 int type = 0;
@@ -40,4 +40,4 @@ static __init int pci_access_init(void)
40 40
41 return 0; 41 return 0;
42} 42}
43arch_initcall(pci_access_init); 43arch_initcall(pci_arch_init);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index ca8df9c260bc..006599db0dc7 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -11,8 +11,8 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/dmi.h> 13#include <linux/dmi.h>
14#include <asm/io.h> 14#include <linux/io.h>
15#include <asm/smp.h> 15#include <linux/smp.h>
16#include <asm/io_apic.h> 16#include <asm/io_apic.h>
17#include <linux/irq.h> 17#include <linux/irq.h>
18#include <linux/acpi.h> 18#include <linux/acpi.h>
@@ -45,7 +45,8 @@ struct irq_router {
45 char *name; 45 char *name;
46 u16 vendor, device; 46 u16 vendor, device;
47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); 47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); 48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
49 int new);
49}; 50};
50 51
51struct irq_router_handler { 52struct irq_router_handler {
@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
61 * and perform checksum verification. 62 * and perform checksum verification.
62 */ 63 */
63 64
64static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) 65static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
65{ 66{
66 struct irq_routing_table *rt; 67 struct irq_routing_table *rt;
67 int i; 68 int i;
@@ -74,10 +75,11 @@ static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
74 rt->size < sizeof(struct irq_routing_table)) 75 rt->size < sizeof(struct irq_routing_table))
75 return NULL; 76 return NULL;
76 sum = 0; 77 sum = 0;
77 for (i=0; i < rt->size; i++) 78 for (i = 0; i < rt->size; i++)
78 sum += addr[i]; 79 sum += addr[i];
79 if (!sum) { 80 if (!sum) {
80 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); 81 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
82 rt);
81 return rt; 83 return rt;
82 } 84 }
83 return NULL; 85 return NULL;
@@ -100,7 +102,7 @@ static struct irq_routing_table * __init pirq_find_routing_table(void)
100 return rt; 102 return rt;
101 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); 103 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
102 } 104 }
103 for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { 105 for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
104 rt = pirq_check_routing_table(addr); 106 rt = pirq_check_routing_table(addr);
105 if (rt) 107 if (rt)
106 return rt; 108 return rt;
@@ -122,20 +124,20 @@ static void __init pirq_peer_trick(void)
122 struct irq_info *e; 124 struct irq_info *e;
123 125
124 memset(busmap, 0, sizeof(busmap)); 126 memset(busmap, 0, sizeof(busmap));
125 for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { 127 for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
126 e = &rt->slots[i]; 128 e = &rt->slots[i];
127#ifdef DEBUG 129#ifdef DEBUG
128 { 130 {
129 int j; 131 int j;
130 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); 132 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
131 for(j=0; j<4; j++) 133 for (j = 0; j < 4; j++)
132 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); 134 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
133 DBG("\n"); 135 DBG("\n");
134 } 136 }
135#endif 137#endif
136 busmap[e->bus] = 1; 138 busmap[e->bus] = 1;
137 } 139 }
138 for(i = 1; i < 256; i++) { 140 for (i = 1; i < 256; i++) {
139 int node; 141 int node;
140 if (!busmap[i] || pci_find_bus(0, i)) 142 if (!busmap[i] || pci_find_bus(0, i))
141 continue; 143 continue;
@@ -183,7 +185,8 @@ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset,
183 return (nr & 1) ? (x >> 4) : (x & 0xf); 185 return (nr & 1) ? (x >> 4) : (x & 0xf);
184} 186}
185 187
186static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) 188static void write_config_nybble(struct pci_dev *router, unsigned offset,
189 unsigned nr, unsigned int val)
187{ 190{
188 u8 x; 191 u8 x;
189 unsigned reg = offset + (nr >> 1); 192 unsigned reg = offset + (nr >> 1);
@@ -285,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
285 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 288 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
286 289
287 WARN_ON_ONCE(pirq > 4); 290 WARN_ON_ONCE(pirq > 4);
288 return read_config_nybble(router,0x43, pirqmap[pirq-1]); 291 return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
289} 292}
290 293
291static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 294static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
@@ -314,7 +317,7 @@ static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
314 317
315/* 318/*
316 * Cyrix: nibble offset 0x5C 319 * Cyrix: nibble offset 0x5C
317 * 0x5C bits 7:4 is INTB bits 3:0 is INTA 320 * 0x5C bits 7:4 is INTB bits 3:0 is INTA
318 * 0x5D bits 7:4 is INTD bits 3:0 is INTC 321 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
319 */ 322 */
320static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 323static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
@@ -350,7 +353,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
350 * Apparently there are systems implementing PCI routing table using 353 * Apparently there are systems implementing PCI routing table using
351 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. 354 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
352 * We try our best to handle both link mappings. 355 * We try our best to handle both link mappings.
353 * 356 *
354 * Currently (2003-05-21) it appears most SiS chipsets follow the 357 * Currently (2003-05-21) it appears most SiS chipsets follow the
355 * definition of routing registers from the SiS-5595 southbridge. 358 * definition of routing registers from the SiS-5595 southbridge.
356 * According to the SiS 5595 datasheets the revision id's of the 359 * According to the SiS 5595 datasheets the revision id's of the
@@ -370,7 +373,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
370 * 373 *
371 * 0x62: USBIRQ: 374 * 0x62: USBIRQ:
372 * bit 6 OHCI function disabled (0), enabled (1) 375 * bit 6 OHCI function disabled (0), enabled (1)
373 * 376 *
374 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved 377 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
375 * 378 *
376 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved 379 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
@@ -433,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
433{ 436{
434 WARN_ON_ONCE(pirq >= 9); 437 WARN_ON_ONCE(pirq >= 9);
435 if (pirq > 8) { 438 if (pirq > 8) {
436 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 439 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
437 return 0; 440 return 0;
438 } 441 }
439 return read_config_nybble(router, 0x74, pirq-1); 442 return read_config_nybble(router, 0x74, pirq-1);
@@ -443,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
443{ 446{
444 WARN_ON_ONCE(pirq >= 9); 447 WARN_ON_ONCE(pirq >= 9);
445 if (pirq > 8) { 448 if (pirq > 8) {
446 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 449 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
447 return 0; 450 return 0;
448 } 451 }
449 write_config_nybble(router, 0x74, pirq-1, irq); 452 write_config_nybble(router, 0x74, pirq-1, irq);
@@ -467,7 +470,8 @@ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int
467 return inb(0xc01) & 0xf; 470 return inb(0xc01) & 0xf;
468} 471}
469 472
470static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 473static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
474 int pirq, int irq)
471{ 475{
472 outb(pirq, 0xc00); 476 outb(pirq, 0xc00);
473 outb(irq, 0xc01); 477 outb(irq, 0xc01);
@@ -487,22 +491,20 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
487 u8 irq; 491 u8 irq;
488 irq = 0; 492 irq = 0;
489 if (pirq <= 4) 493 if (pirq <= 4)
490 {
491 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
492 } 495 dev_info(&dev->dev,
493 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
494 dev->vendor, dev->device, pirq, irq); 497 dev->vendor, dev->device, pirq, irq);
495 return irq; 498 return irq;
496} 499}
497 500
498static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
499{ 502{
500 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 503 dev_info(&dev->dev,
501 dev->vendor, dev->device, pirq, irq); 504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq);
502 if (pirq <= 4) 506 if (pirq <= 4)
503 {
504 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
505 }
506 return 1; 508 return 1;
507} 509}
508 510
@@ -549,50 +551,51 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
549 if (pci_dev_present(pirq_440gx)) 551 if (pci_dev_present(pirq_440gx))
550 return 0; 552 return 0;
551 553
552 switch(device) 554 switch (device) {
553 { 555 case PCI_DEVICE_ID_INTEL_82371FB_0:
554 case PCI_DEVICE_ID_INTEL_82371FB_0: 556 case PCI_DEVICE_ID_INTEL_82371SB_0:
555 case PCI_DEVICE_ID_INTEL_82371SB_0: 557 case PCI_DEVICE_ID_INTEL_82371AB_0:
556 case PCI_DEVICE_ID_INTEL_82371AB_0: 558 case PCI_DEVICE_ID_INTEL_82371MX:
557 case PCI_DEVICE_ID_INTEL_82371MX: 559 case PCI_DEVICE_ID_INTEL_82443MX_0:
558 case PCI_DEVICE_ID_INTEL_82443MX_0: 560 case PCI_DEVICE_ID_INTEL_82801AA_0:
559 case PCI_DEVICE_ID_INTEL_82801AA_0: 561 case PCI_DEVICE_ID_INTEL_82801AB_0:
560 case PCI_DEVICE_ID_INTEL_82801AB_0: 562 case PCI_DEVICE_ID_INTEL_82801BA_0:
561 case PCI_DEVICE_ID_INTEL_82801BA_0: 563 case PCI_DEVICE_ID_INTEL_82801BA_10:
562 case PCI_DEVICE_ID_INTEL_82801BA_10: 564 case PCI_DEVICE_ID_INTEL_82801CA_0:
563 case PCI_DEVICE_ID_INTEL_82801CA_0: 565 case PCI_DEVICE_ID_INTEL_82801CA_12:
564 case PCI_DEVICE_ID_INTEL_82801CA_12: 566 case PCI_DEVICE_ID_INTEL_82801DB_0:
565 case PCI_DEVICE_ID_INTEL_82801DB_0: 567 case PCI_DEVICE_ID_INTEL_82801E_0:
566 case PCI_DEVICE_ID_INTEL_82801E_0: 568 case PCI_DEVICE_ID_INTEL_82801EB_0:
567 case PCI_DEVICE_ID_INTEL_82801EB_0: 569 case PCI_DEVICE_ID_INTEL_ESB_1:
568 case PCI_DEVICE_ID_INTEL_ESB_1: 570 case PCI_DEVICE_ID_INTEL_ICH6_0:
569 case PCI_DEVICE_ID_INTEL_ICH6_0: 571 case PCI_DEVICE_ID_INTEL_ICH6_1:
570 case PCI_DEVICE_ID_INTEL_ICH6_1: 572 case PCI_DEVICE_ID_INTEL_ICH7_0:
571 case PCI_DEVICE_ID_INTEL_ICH7_0: 573 case PCI_DEVICE_ID_INTEL_ICH7_1:
572 case PCI_DEVICE_ID_INTEL_ICH7_1: 574 case PCI_DEVICE_ID_INTEL_ICH7_30:
573 case PCI_DEVICE_ID_INTEL_ICH7_30: 575 case PCI_DEVICE_ID_INTEL_ICH7_31:
574 case PCI_DEVICE_ID_INTEL_ICH7_31: 576 case PCI_DEVICE_ID_INTEL_ESB2_0:
575 case PCI_DEVICE_ID_INTEL_ESB2_0: 577 case PCI_DEVICE_ID_INTEL_ICH8_0:
576 case PCI_DEVICE_ID_INTEL_ICH8_0: 578 case PCI_DEVICE_ID_INTEL_ICH8_1:
577 case PCI_DEVICE_ID_INTEL_ICH8_1: 579 case PCI_DEVICE_ID_INTEL_ICH8_2:
578 case PCI_DEVICE_ID_INTEL_ICH8_2: 580 case PCI_DEVICE_ID_INTEL_ICH8_3:
579 case PCI_DEVICE_ID_INTEL_ICH8_3: 581 case PCI_DEVICE_ID_INTEL_ICH8_4:
580 case PCI_DEVICE_ID_INTEL_ICH8_4: 582 case PCI_DEVICE_ID_INTEL_ICH9_0:
581 case PCI_DEVICE_ID_INTEL_ICH9_0: 583 case PCI_DEVICE_ID_INTEL_ICH9_1:
582 case PCI_DEVICE_ID_INTEL_ICH9_1: 584 case PCI_DEVICE_ID_INTEL_ICH9_2:
583 case PCI_DEVICE_ID_INTEL_ICH9_2: 585 case PCI_DEVICE_ID_INTEL_ICH9_3:
584 case PCI_DEVICE_ID_INTEL_ICH9_3: 586 case PCI_DEVICE_ID_INTEL_ICH9_4:
585 case PCI_DEVICE_ID_INTEL_ICH9_4: 587 case PCI_DEVICE_ID_INTEL_ICH9_5:
586 case PCI_DEVICE_ID_INTEL_ICH9_5: 588 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
587 case PCI_DEVICE_ID_INTEL_TOLAPAI_0: 589 case PCI_DEVICE_ID_INTEL_ICH10_0:
588 case PCI_DEVICE_ID_INTEL_ICH10_0: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 593 case PCI_DEVICE_ID_INTEL_PCH_0:
592 r->name = "PIIX/ICH"; 594 case PCI_DEVICE_ID_INTEL_PCH_1:
593 r->get = pirq_piix_get; 595 r->name = "PIIX/ICH";
594 r->set = pirq_piix_set; 596 r->get = pirq_piix_get;
595 return 1; 597 r->set = pirq_piix_set;
598 return 1;
596 } 599 }
597 return 0; 600 return 0;
598} 601}
@@ -606,7 +609,7 @@ static __init int via_router_probe(struct irq_router *r,
606 * workarounds for some buggy BIOSes 609 * workarounds for some buggy BIOSes
607 */ 610 */
608 if (device == PCI_DEVICE_ID_VIA_82C586_0) { 611 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
609 switch(router->device) { 612 switch (router->device) {
610 case PCI_DEVICE_ID_VIA_82C686: 613 case PCI_DEVICE_ID_VIA_82C686:
611 /* 614 /*
612 * Asus k7m bios wrongly reports 82C686A 615 * Asus k7m bios wrongly reports 82C686A
@@ -631,7 +634,7 @@ static __init int via_router_probe(struct irq_router *r,
631 } 634 }
632 } 635 }
633 636
634 switch(device) { 637 switch (device) {
635 case PCI_DEVICE_ID_VIA_82C586_0: 638 case PCI_DEVICE_ID_VIA_82C586_0:
636 r->name = "VIA"; 639 r->name = "VIA";
637 r->get = pirq_via586_get; 640 r->get = pirq_via586_get;
@@ -654,28 +657,27 @@ static __init int via_router_probe(struct irq_router *r,
654 657
655static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 658static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
656{ 659{
657 switch(device) 660 switch (device) {
658 { 661 case PCI_DEVICE_ID_VLSI_82C534:
659 case PCI_DEVICE_ID_VLSI_82C534: 662 r->name = "VLSI 82C534";
660 r->name = "VLSI 82C534"; 663 r->get = pirq_vlsi_get;
661 r->get = pirq_vlsi_get; 664 r->set = pirq_vlsi_set;
662 r->set = pirq_vlsi_set; 665 return 1;
663 return 1;
664 } 666 }
665 return 0; 667 return 0;
666} 668}
667 669
668 670
669static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 671static __init int serverworks_router_probe(struct irq_router *r,
672 struct pci_dev *router, u16 device)
670{ 673{
671 switch(device) 674 switch (device) {
672 { 675 case PCI_DEVICE_ID_SERVERWORKS_OSB4:
673 case PCI_DEVICE_ID_SERVERWORKS_OSB4: 676 case PCI_DEVICE_ID_SERVERWORKS_CSB5:
674 case PCI_DEVICE_ID_SERVERWORKS_CSB5: 677 r->name = "ServerWorks";
675 r->name = "ServerWorks"; 678 r->get = pirq_serverworks_get;
676 r->get = pirq_serverworks_get; 679 r->set = pirq_serverworks_set;
677 r->set = pirq_serverworks_set; 680 return 1;
678 return 1;
679 } 681 }
680 return 0; 682 return 0;
681} 683}
@@ -684,7 +686,7 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router,
684{ 686{
685 if (device != PCI_DEVICE_ID_SI_503) 687 if (device != PCI_DEVICE_ID_SI_503)
686 return 0; 688 return 0;
687 689
688 r->name = "SIS"; 690 r->name = "SIS";
689 r->get = pirq_sis_get; 691 r->get = pirq_sis_get;
690 r->set = pirq_sis_set; 692 r->set = pirq_sis_set;
@@ -693,50 +695,45 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router,
693 695
694static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 696static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
695{ 697{
696 switch(device) 698 switch (device) {
697 { 699 case PCI_DEVICE_ID_CYRIX_5520:
698 case PCI_DEVICE_ID_CYRIX_5520: 700 r->name = "NatSemi";
699 r->name = "NatSemi"; 701 r->get = pirq_cyrix_get;
700 r->get = pirq_cyrix_get; 702 r->set = pirq_cyrix_set;
701 r->set = pirq_cyrix_set; 703 return 1;
702 return 1;
703 } 704 }
704 return 0; 705 return 0;
705} 706}
706 707
707static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 708static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
708{ 709{
709 switch(device) 710 switch (device) {
710 { 711 case PCI_DEVICE_ID_OPTI_82C700:
711 case PCI_DEVICE_ID_OPTI_82C700: 712 r->name = "OPTI";
712 r->name = "OPTI"; 713 r->get = pirq_opti_get;
713 r->get = pirq_opti_get; 714 r->set = pirq_opti_set;
714 r->set = pirq_opti_set; 715 return 1;
715 return 1;
716 } 716 }
717 return 0; 717 return 0;
718} 718}
719 719
720static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 720static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
721{ 721{
722 switch(device) 722 switch (device) {
723 { 723 case PCI_DEVICE_ID_ITE_IT8330G_0:
724 case PCI_DEVICE_ID_ITE_IT8330G_0: 724 r->name = "ITE";
725 r->name = "ITE"; 725 r->get = pirq_ite_get;
726 r->get = pirq_ite_get; 726 r->set = pirq_ite_set;
727 r->set = pirq_ite_set; 727 return 1;
728 return 1;
729 } 728 }
730 return 0; 729 return 0;
731} 730}
732 731
733static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 732static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
734{ 733{
735 switch(device) 734 switch (device) {
736 {
737 case PCI_DEVICE_ID_AL_M1533: 735 case PCI_DEVICE_ID_AL_M1533:
738 case PCI_DEVICE_ID_AL_M1563: 736 case PCI_DEVICE_ID_AL_M1563:
739 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
740 r->name = "ALI"; 737 r->name = "ALI";
741 r->get = pirq_ali_get; 738 r->get = pirq_ali_get;
742 r->set = pirq_ali_set; 739 r->set = pirq_ali_set;
@@ -747,25 +744,24 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
747 744
748static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 745static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
749{ 746{
750 switch(device) 747 switch (device) {
751 { 748 case PCI_DEVICE_ID_AMD_VIPER_740B:
752 case PCI_DEVICE_ID_AMD_VIPER_740B: 749 r->name = "AMD756";
753 r->name = "AMD756"; 750 break;
754 break; 751 case PCI_DEVICE_ID_AMD_VIPER_7413:
755 case PCI_DEVICE_ID_AMD_VIPER_7413: 752 r->name = "AMD766";
756 r->name = "AMD766"; 753 break;
757 break; 754 case PCI_DEVICE_ID_AMD_VIPER_7443:
758 case PCI_DEVICE_ID_AMD_VIPER_7443: 755 r->name = "AMD768";
759 r->name = "AMD768"; 756 break;
760 break; 757 default:
761 default: 758 return 0;
762 return 0;
763 } 759 }
764 r->get = pirq_amd756_get; 760 r->get = pirq_amd756_get;
765 r->set = pirq_amd756_set; 761 r->set = pirq_amd756_set;
766 return 1; 762 return 1;
767} 763}
768 764
769static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 765static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
770{ 766{
771 switch (device) { 767 switch (device) {
@@ -807,7 +803,7 @@ static struct pci_dev *pirq_router_dev;
807 * FIXME: should we have an option to say "generic for 803 * FIXME: should we have an option to say "generic for
808 * chipset" ? 804 * chipset" ?
809 */ 805 */
810 806
811static void __init pirq_find_router(struct irq_router *r) 807static void __init pirq_find_router(struct irq_router *r)
812{ 808{
813 struct irq_routing_table *rt = pirq_table; 809 struct irq_routing_table *rt = pirq_table;
@@ -826,7 +822,7 @@ static void __init pirq_find_router(struct irq_router *r)
826 r->name = "default"; 822 r->name = "default";
827 r->get = NULL; 823 r->get = NULL;
828 r->set = NULL; 824 r->set = NULL;
829 825
830 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 826 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
831 rt->rtr_vendor, rt->rtr_device); 827 rt->rtr_vendor, rt->rtr_device);
832 828
@@ -837,19 +833,19 @@ static void __init pirq_find_router(struct irq_router *r)
837 return; 833 return;
838 } 834 }
839 835
840 for( h = pirq_routers; h->vendor; h++) { 836 for (h = pirq_routers; h->vendor; h++) {
841 /* First look for a router match */ 837 /* First look for a router match */
842 if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) 838 if (rt->rtr_vendor == h->vendor &&
839 h->probe(r, pirq_router_dev, rt->rtr_device))
843 break; 840 break;
844 /* Fall back to a device match */ 841 /* Fall back to a device match */
845 if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) 842 if (pirq_router_dev->vendor == h->vendor &&
843 h->probe(r, pirq_router_dev, pirq_router_dev->device))
846 break; 844 break;
847 } 845 }
848 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 846 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
849 pirq_router.name, 847 pirq_router.name,
850 pirq_router_dev->vendor, 848 pirq_router_dev->vendor, pirq_router_dev->device);
851 pirq_router_dev->device,
852 pci_name(pirq_router_dev));
853 849
854 /* The device remains referenced for the kernel lifetime */ 850 /* The device remains referenced for the kernel lifetime */
855} 851}
@@ -857,11 +853,13 @@ static void __init pirq_find_router(struct irq_router *r)
857static struct irq_info *pirq_get_info(struct pci_dev *dev) 853static struct irq_info *pirq_get_info(struct pci_dev *dev)
858{ 854{
859 struct irq_routing_table *rt = pirq_table; 855 struct irq_routing_table *rt = pirq_table;
860 int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); 856 int entries = (rt->size - sizeof(struct irq_routing_table)) /
857 sizeof(struct irq_info);
861 struct irq_info *info; 858 struct irq_info *info;
862 859
863 for (info = rt->slots; entries--; info++) 860 for (info = rt->slots; entries--; info++)
864 if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) 861 if (info->bus == dev->bus->number &&
862 PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
865 return info; 863 return info;
866 return NULL; 864 return NULL;
867} 865}
@@ -880,7 +878,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
880 /* Find IRQ pin */ 878 /* Find IRQ pin */
881 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 879 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
882 if (!pin) { 880 if (!pin) {
883 DBG(KERN_DEBUG " -> no interrupt pin\n"); 881 dev_dbg(&dev->dev, "no interrupt pin\n");
884 return 0; 882 return 0;
885 } 883 }
886 pin = pin - 1; 884 pin = pin - 1;
@@ -889,20 +887,21 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
889 887
890 if (!pirq_table) 888 if (!pirq_table)
891 return 0; 889 return 0;
892 890
893 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
894 info = pirq_get_info(dev); 891 info = pirq_get_info(dev);
895 if (!info) { 892 if (!info) {
896 DBG(" -> not found in routing table\n" KERN_DEBUG); 893 dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
894 'A' + pin);
897 return 0; 895 return 0;
898 } 896 }
899 pirq = info->irq[pin].link; 897 pirq = info->irq[pin].link;
900 mask = info->irq[pin].bitmap; 898 mask = info->irq[pin].bitmap;
901 if (!pirq) { 899 if (!pirq) {
902 DBG(" -> not routed\n" KERN_DEBUG); 900 dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
903 return 0; 901 return 0;
904 } 902 }
905 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); 903 dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
904 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
906 mask &= pcibios_irq_mask; 905 mask &= pcibios_irq_mask;
907 906
908 /* Work around broken HP Pavilion Notebooks which assign USB to 907 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -915,7 +914,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
915 } 914 }
916 915
917 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ 916 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
918 if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { 917 if (acer_tm360_irqrouting && dev->irq == 11 &&
918 dev->vendor == PCI_VENDOR_ID_O2) {
919 pirq = 0x68; 919 pirq = 0x68;
920 mask = 0x400; 920 mask = 0x400;
921 dev->irq = r->get(pirq_router_dev, dev, pirq); 921 dev->irq = r->get(pirq_router_dev, dev, pirq);
@@ -928,51 +928,50 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
928 */ 928 */
929 newirq = dev->irq; 929 newirq = dev->irq;
930 if (newirq && !((1 << newirq) & mask)) { 930 if (newirq && !((1 << newirq) & mask)) {
931 if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; 931 if (pci_probe & PCI_USE_PIRQ_MASK)
932 else printk("\n" KERN_WARNING 932 newirq = 0;
933 "PCI: IRQ %i for device %s doesn't match PIRQ mask " 933 else
934 "- try pci=usepirqmask\n" KERN_DEBUG, newirq, 934 dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
935 pci_name(dev)); 935 "%#x; try pci=usepirqmask\n", newirq, mask);
936 } 936 }
937 if (!newirq && assign) { 937 if (!newirq && assign) {
938 for (i = 0; i < 16; i++) { 938 for (i = 0; i < 16; i++) {
939 if (!(mask & (1 << i))) 939 if (!(mask & (1 << i)))
940 continue; 940 continue;
941 if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) 941 if (pirq_penalty[i] < pirq_penalty[newirq] &&
942 can_request_irq(i, IRQF_SHARED))
942 newirq = i; 943 newirq = i;
943 } 944 }
944 } 945 }
945 DBG(" -> newirq=%d", newirq); 946 dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
946 947
947 /* Check if it is hardcoded */ 948 /* Check if it is hardcoded */
948 if ((pirq & 0xf0) == 0xf0) { 949 if ((pirq & 0xf0) == 0xf0) {
949 irq = pirq & 0xf; 950 irq = pirq & 0xf;
950 DBG(" -> hardcoded IRQ %d\n", irq); 951 msg = "hardcoded";
951 msg = "Hardcoded"; 952 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
952 } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 953 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
953 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { 954 msg = "found";
954 DBG(" -> got IRQ %d\n", irq);
955 msg = "Found";
956 eisa_set_level_irq(irq); 955 eisa_set_level_irq(irq);
957 } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 956 } else if (newirq && r->set &&
958 DBG(" -> assigning IRQ %d", newirq); 957 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
959 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 958 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
960 eisa_set_level_irq(newirq); 959 eisa_set_level_irq(newirq);
961 DBG(" ... OK\n"); 960 msg = "assigned";
962 msg = "Assigned";
963 irq = newirq; 961 irq = newirq;
964 } 962 }
965 } 963 }
966 964
967 if (!irq) { 965 if (!irq) {
968 DBG(" ... failed\n");
969 if (newirq && mask == (1 << newirq)) { 966 if (newirq && mask == (1 << newirq)) {
970 msg = "Guessed"; 967 msg = "guessed";
971 irq = newirq; 968 irq = newirq;
972 } else 969 } else {
970 dev_dbg(&dev->dev, "can't route interrupt\n");
973 return 0; 971 return 0;
972 }
974 } 973 }
975 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); 974 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
976 975
977 /* Update IRQ for all devices with the same pirq value */ 976 /* Update IRQ for all devices with the same pirq value */
978 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 977 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
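
The cleaned-up pcibios_lookup_irq() keeps the original decision order: a link value of 0xF0-0xFF means the IRQ is hardcoded in the low nibble; otherwise the router's get() hook reports the currently programmed IRQ; only if neither yields anything does the code pick a new IRQ from the PIRQ bitmap, preferring the candidate with the lowest ISA penalty that can still be shared. A self-contained sketch of just that candidate search (the penalty table and the sharing check below are stand-ins, not the kernel's):

/* Stand-ins for the kernel's pirq_penalty[] table and can_request_irq(). */
static unsigned int penalty[16] = {
	1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
	0, 0, 0, 0, 1000, 100000, 100000, 100000	/* placeholder weights */
};

static int irq_can_be_shared(int irq)
{
	return 1;	/* placeholder: the kernel asks can_request_irq() */
}

/* Every set bit i in 'mask' marks IRQ i as usable for this PIRQ link;
 * keep the current candidate unless a cheaper, shareable IRQ exists. */
static int pick_irq_from_mask(unsigned int mask, int newirq)
{
	int i;

	for (i = 0; i < 16; i++) {
		if (!(mask & (1 << i)))
			continue;
		if (penalty[i] < penalty[newirq] && irq_can_be_shared(i))
			newirq = i;
	}
	return newirq;
}
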
@@ -984,20 +983,25 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
984 if (!info) 983 if (!info)
985 continue; 984 continue;
986 if (info->irq[pin].link == pirq) { 985 if (info->irq[pin].link == pirq) {
987 /* We refuse to override the dev->irq information. Give a warning! */ 986 /*
988 if ( dev2->irq && dev2->irq != irq && \ 987 * We refuse to override the dev->irq
988 * information. Give a warning!
989 */
990 if (dev2->irq && dev2->irq != irq && \
989 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 991 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
990 ((1 << dev2->irq) & mask)) ) { 992 ((1 << dev2->irq) & mask))) {
991#ifndef CONFIG_PCI_MSI 993#ifndef CONFIG_PCI_MSI
992 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 994 dev_info(&dev2->dev, "IRQ routing conflict: "
993 pci_name(dev2), dev2->irq, irq); 995 "have IRQ %d, want IRQ %d\n",
996 dev2->irq, irq);
994#endif 997#endif
995 continue; 998 continue;
996 } 999 }
997 dev2->irq = irq; 1000 dev2->irq = irq;
998 pirq_penalty[irq]++; 1001 pirq_penalty[irq]++;
999 if (dev != dev2) 1002 if (dev != dev2)
1000 printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); 1003 dev_info(&dev->dev, "sharing IRQ %d with %s\n",
1004 irq, pci_name(dev2));
1001 } 1005 }
1002 } 1006 }
1003 return 1; 1007 return 1;
@@ -1011,15 +1015,20 @@ static void __init pcibios_fixup_irqs(void)
1011 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1015 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1012 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1016 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1013 /* 1017 /*
1014 * If the BIOS has set an out of range IRQ number, just ignore it. 1018 * If the BIOS has set an out of range IRQ number, just
1015 * Also keep track of which IRQ's are already in use. 1019 * ignore it. Also keep track of which IRQ's are
1020 * already in use.
1016 */ 1021 */
1017 if (dev->irq >= 16) { 1022 if (dev->irq >= 16) {
1018 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); 1023 dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
1019 dev->irq = 0; 1024 dev->irq = 0;
1020 } 1025 }
1021 /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ 1026 /*
1022 if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) 1027 * If the IRQ is already assigned to a PCI device,
1028 * ignore its ISA use penalty
1029 */
1030 if (pirq_penalty[dev->irq] >= 100 &&
1031 pirq_penalty[dev->irq] < 100000)
1023 pirq_penalty[dev->irq] = 0; 1032 pirq_penalty[dev->irq] = 0;
1024 pirq_penalty[dev->irq]++; 1033 pirq_penalty[dev->irq]++;
1025 } 1034 }
@@ -1031,34 +1040,47 @@ static void __init pcibios_fixup_irqs(void)
1031 /* 1040 /*
1032 * Recalculate IRQ numbers if we use the I/O APIC. 1041 * Recalculate IRQ numbers if we use the I/O APIC.
1033 */ 1042 */
1034 if (io_apic_assign_pci_irqs) 1043 if (io_apic_assign_pci_irqs) {
1035 {
1036 int irq; 1044 int irq;
1037 1045
1038 if (pin) { 1046 if (!pin)
1039 pin--; /* interrupt pins are numbered starting from 1 */ 1047 continue;
1040 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); 1048
1041 /* 1049 /*
1042 * Busses behind bridges are typically not listed in the MP-table. 1050 * interrupt pins are numbered starting from 1
1043 * In this case we have to look up the IRQ based on the parent bus, 1051 */
1044 * parent slot, and pin number. The SMP code detects such bridged 1052 pin--;
1045 * busses itself so we should get into this branch reliably. 1053 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1046 */ 1054 PCI_SLOT(dev->devfn), pin);
1047 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1055 /*
1048 struct pci_dev * bridge = dev->bus->self; 1056 * Busses behind bridges are typically not listed in the
1049 1057 * MP-table. In this case we have to look up the IRQ
1050 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1058 * based on the parent bus, parent slot, and pin number.
1051 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1059 * The SMP code detects such bridged busses itself so we
1052 PCI_SLOT(bridge->devfn), pin); 1060 * should get into this branch reliably.
1053 if (irq >= 0) 1061 */
1054 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1062 if (irq < 0 && dev->bus->parent) {
1055 pci_name(bridge), 'A' + pin, irq); 1063 /* go back to the bridge */
1056 } 1064 struct pci_dev *bridge = dev->bus->self;
1057 if (irq >= 0) { 1065 int bus;
1058 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1066
1059 pci_name(dev), 'A' + pin, irq); 1067 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1060 dev->irq = irq; 1068 bus = bridge->bus->number;
1061 } 1069 irq = IO_APIC_get_PCI_irq_vector(bus,
1070 PCI_SLOT(bridge->devfn), pin);
1071 if (irq >= 0)
1072 dev_warn(&dev->dev,
1073 "using bridge %s INT %c to "
1074 "get IRQ %d\n",
1075 pci_name(bridge),
1076 'A' + pin, irq);
1077 }
1078 if (irq >= 0) {
1079 dev_info(&dev->dev,
1080 "PCI->APIC IRQ transform: INT %c "
1081 "-> IRQ %d\n",
1082 'A' + pin, irq);
1083 dev->irq = irq;
1062 } 1084 }
1063 } 1085 }
1064#endif 1086#endif
@@ -1078,7 +1100,8 @@ static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
1078{ 1100{
1079 if (!broken_hp_bios_irq9) { 1101 if (!broken_hp_bios_irq9) {
1080 broken_hp_bios_irq9 = 1; 1102 broken_hp_bios_irq9 = 1;
1081 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1103 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1104 d->ident);
1082 } 1105 }
1083 return 0; 1106 return 0;
1084} 1107}
@@ -1091,7 +1114,8 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
1091{ 1114{
1092 if (!acer_tm360_irqrouting) { 1115 if (!acer_tm360_irqrouting) {
1093 acer_tm360_irqrouting = 1; 1116 acer_tm360_irqrouting = 1;
1094 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1117 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1118 d->ident);
1095 } 1119 }
1096 return 0; 1120 return 0;
1097} 1121}
@@ -1103,7 +1127,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1103 .matches = { 1127 .matches = {
1104 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1128 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1105 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), 1129 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
1106 DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), 1130 DMI_MATCH(DMI_PRODUCT_VERSION,
1131 "HP Pavilion Notebook Model GE"),
1107 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), 1132 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
1108 }, 1133 },
1109 }, 1134 },
@@ -1118,7 +1143,7 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1118 { } 1143 { }
1119}; 1144};
1120 1145
1121static int __init pcibios_irq_init(void) 1146int __init pcibios_irq_init(void)
1122{ 1147{
1123 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1148 DBG(KERN_DEBUG "PCI: IRQ init\n");
1124 1149
@@ -1138,11 +1163,14 @@ static int __init pcibios_irq_init(void)
1138 pirq_find_router(&pirq_router); 1163 pirq_find_router(&pirq_router);
1139 if (pirq_table->exclusive_irqs) { 1164 if (pirq_table->exclusive_irqs) {
1140 int i; 1165 int i;
1141 for (i=0; i<16; i++) 1166 for (i = 0; i < 16; i++)
1142 if (!(pirq_table->exclusive_irqs & (1 << i))) 1167 if (!(pirq_table->exclusive_irqs & (1 << i)))
1143 pirq_penalty[i] += 100; 1168 pirq_penalty[i] += 100;
1144 } 1169 }
1145 /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ 1170 /*
1171 * If we're using the I/O APIC, avoid using the PCI IRQ
1172 * routing table
1173 */
1146 if (io_apic_assign_pci_irqs) 1174 if (io_apic_assign_pci_irqs)
1147 pirq_table = NULL; 1175 pirq_table = NULL;
1148 } 1176 }
@@ -1153,9 +1181,6 @@ static int __init pcibios_irq_init(void)
1153 return 0; 1181 return 0;
1154} 1182}
1155 1183
1156subsys_initcall(pcibios_irq_init);
1157
1158
1159static void pirq_penalize_isa_irq(int irq, int active) 1184static void pirq_penalize_isa_irq(int irq, int active)
1160{ 1185{
1161 /* 1186 /*
@@ -1189,7 +1214,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
1189 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1214 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
1190 char *msg = ""; 1215 char *msg = "";
1191 1216
1192 pin--; /* interrupt pins are numbered starting from 1 */ 1217 pin--; /* interrupt pins are numbered starting from 1 */
1193 1218
1194 if (io_apic_assign_pci_irqs) { 1219 if (io_apic_assign_pci_irqs) {
1195 int irq; 1220 int irq;
@@ -1203,35 +1228,41 @@ static int pirq_enable_irq(struct pci_dev *dev)
1203 */ 1228 */
1204 temp_dev = dev; 1229 temp_dev = dev;
1205 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1230 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
1206 struct pci_dev * bridge = dev->bus->self; 1231 struct pci_dev *bridge = dev->bus->self;
1207 1232
1208 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1233 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1209 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1234 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1210 PCI_SLOT(bridge->devfn), pin); 1235 PCI_SLOT(bridge->devfn), pin);
1211 if (irq >= 0) 1236 if (irq >= 0)
1212 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1237 dev_warn(&dev->dev, "using bridge %s "
1213 pci_name(bridge), 'A' + pin, irq); 1238 "INT %c to get IRQ %d\n",
1239 pci_name(bridge), 'A' + pin,
1240 irq);
1214 dev = bridge; 1241 dev = bridge;
1215 } 1242 }
1216 dev = temp_dev; 1243 dev = temp_dev;
1217 if (irq >= 0) { 1244 if (irq >= 0) {
1218 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1245 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1219 pci_name(dev), 'A' + pin, irq); 1246 "INT %c -> IRQ %d\n", 'A' + pin, irq);
1220 dev->irq = irq; 1247 dev->irq = irq;
1221 return 0; 1248 return 0;
1222 } else 1249 } else
1223 msg = " Probably buggy MP table."; 1250 msg = "; probably buggy MP table";
1224 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1251 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1225 msg = ""; 1252 msg = "";
1226 else 1253 else
1227 msg = " Please try using pci=biosirq."; 1254 msg = "; please try using pci=biosirq";
1228 1255
1229 /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ 1256 /*
1230 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) 1257 * With IDE legacy devices the IRQ lookup failure is not
1258 * a problem..
1259 */
1260 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
1261 !(dev->class & 0x5))
1231 return 0; 1262 return 0;
1232 1263
1233 printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1264 dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
1234 'A' + pin, pci_name(dev), msg); 1265 'A' + pin, msg);
1235 } 1266 }
1236 return 0; 1267 return 0;
1237} 1268}
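
Aside from whitespace and brace-style fixes, the recurring change in irq.c is the conversion from bare printk()/DBG() calls to dev_info()/dev_dbg()/dev_warn(). The dev_* helpers take the struct device and prefix every message with the bus and device name themselves, so the explicit pci_name(dev) argument disappears from the format strings. Roughly (the resulting console prefixes are illustrative):

/* Old style: the caller has to identify the device in the message. */
printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
	/* -> "PCI: Assigned IRQ 11 for device 0000:00:1f.3" */

/* New style: dev_info() derives the "pci 0000:00:1f.3:" prefix itself. */
dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
	/* -> "pci 0000:00:1f.3: assigned PCI INT A -> IRQ 11" */
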
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index a67921ce60af..b722dd481b39 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void)
14 int n, devfn; 14 int n, devfn;
15 long node; 15 long node;
16 16
17 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) 17 if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
18 return; 18 return;
19 DBG("PCI: Peer bridge fixup\n"); 19 DBG("PCI: Peer bridge fixup\n");
20 20
@@ -55,4 +55,21 @@ static int __init pci_legacy_init(void)
55 return 0; 55 return 0;
56} 56}
57 57
58subsys_initcall(pci_legacy_init); 58int __init pci_subsys_init(void)
59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
63#ifdef CONFIG_ACPI
64 pci_acpi_init();
65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
69 pci_legacy_init();
70 pcibios_irq_init();
71 pcibios_init();
72
73 return 0;
74}
75subsys_initcall(pci_subsys_init);
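
Here legacy.c stops registering its own subsys_initcall; pci_subsys_init() now runs the platform probes (NUMA-Q, ACPI, VISWS), the legacy scan, IRQ init and the final resource survey in one explicit sequence instead of relying on link order between independent initcalls. That is safe only because each per-platform hook checks for its hardware and backs out early (found_numaq, is_visws_box() later in this patch). A sketch of that guard pattern, with a hypothetical platform check standing in for the real ones:

#include <errno.h>

static int platform_is_present(void)
{
	return 0;	/* stand-in for a real hardware check, e.g. is_visws_box() */
}

/* Safe to call unconditionally from one central init sequence: if the
 * hardware isn't there, return without touching the generic setup. */
static int example_platform_init(void)
{
	if (!platform_is_present())
		return -ENODEV;
	/* platform-specific PCI setup would go here */
	return 0;
}
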
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 0cfebecf2a8f..654a2234f8f3 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
209 return name != NULL; 209 return name != NULL;
210} 210}
211 211
212static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) 212static void __init pci_mmcfg_insert_resources(void)
213{ 213{
214#define PCI_MMCFG_RESOURCE_NAME_LEN 19 214#define PCI_MMCFG_RESOURCE_NAME_LEN 19
215 int i; 215 int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
233 cfg->pci_segment); 233 cfg->pci_segment);
234 res->start = cfg->address; 234 res->start = cfg->address;
235 res->end = res->start + (num_buses << 20) - 1; 235 res->end = res->start + (num_buses << 20) - 1;
236 res->flags = IORESOURCE_MEM | resource_flags; 236 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
237 insert_resource(&iomem_resource, res); 237 insert_resource(&iomem_resource, res);
238 names += PCI_MMCFG_RESOURCE_NAME_LEN; 238 names += PCI_MMCFG_RESOURCE_NAME_LEN;
239 } 239 }
@@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
293 return AE_OK; 293 return AE_OK;
294} 294}
295 295
296static int __init is_acpi_reserved(unsigned long start, unsigned long end) 296static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
297{ 297{
298 struct resource mcfg_res; 298 struct resource mcfg_res;
299 299
@@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end)
310 return mcfg_res.flags; 310 return mcfg_res.flags;
311} 311}
312 312
313typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
314
315static int __init is_mmconf_reserved(check_reserved_t is_reserved,
316 u64 addr, u64 size, int i,
317 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
318{
319 u64 old_size = size;
320 int valid = 0;
321
322 while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
323 size >>= 1;
324 if (size < (16UL<<20))
325 break;
326 }
327
328 if (size >= (16UL<<20) || size == old_size) {
329 printk(KERN_NOTICE
330 "PCI: MCFG area at %Lx reserved in %s\n",
331 addr, with_e820?"E820":"ACPI motherboard resources");
332 valid = 1;
333
334 if (old_size != size) {
335 /* update end_bus_number */
336 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1);
337 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx "
338 "segment %hu buses %u - %u\n",
339 i, (unsigned long)cfg->address, cfg->pci_segment,
340 (unsigned int)cfg->start_bus_number,
341 (unsigned int)cfg->end_bus_number);
342 }
343 }
344
345 return valid;
346}
347
313static void __init pci_mmcfg_reject_broken(int early) 348static void __init pci_mmcfg_reject_broken(int early)
314{ 349{
315 typeof(pci_mmcfg_config[0]) *cfg; 350 typeof(pci_mmcfg_config[0]) *cfg;
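
is_mmconf_reserved(), added above, factors out the check that the MMCONFIG window is actually reserved (first against ACPI motherboard resources, then against the E820 map): if the full window isn't covered it keeps halving the size, never going below 16 MiB (16 buses), and trims end_bus_number to whatever part is covered. A stand-alone sketch of the shrinking loop with the reservation test stubbed out:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for is_acpi_reserved()/e820_all_mapped(): nonzero when
 * [start, end] is fully covered by a reserved region. Here we pretend
 * only the first 64 MiB of the window is reserved. */
static int range_reserved(uint64_t start, uint64_t end)
{
	return end - start + 1 <= (64ULL << 20);
}

/* Shrink an MMCONFIG window until the remaining part is reserved.
 * Returns the usable size, or 0 if even 16 MiB (16 buses) isn't covered. */
static uint64_t mmconf_usable_size(uint64_t addr, uint64_t size)
{
	while (!range_reserved(addr, addr + size - 1)) {
		size >>= 1;
		if (size < (16ULL << 20))
			return 0;
	}
	return size;
}

int main(void)
{
	/* A 256-bus (256 MiB) window with only 64 MiB reserved shrinks to
	 * 64 MiB, i.e. buses 0-63. */
	printf("usable: %llu MiB\n", (unsigned long long)
	       (mmconf_usable_size(0xe0000000ULL, 256ULL << 20) >> 20));
	return 0;
}
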
@@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early)
324 359
325 for (i = 0; i < pci_mmcfg_config_num; i++) { 360 for (i = 0; i < pci_mmcfg_config_num; i++) {
326 int valid = 0; 361 int valid = 0;
327 u32 size = (cfg->end_bus_number + 1) << 20; 362 u64 addr, size;
363
328 cfg = &pci_mmcfg_config[i]; 364 cfg = &pci_mmcfg_config[i];
365 addr = cfg->start_bus_number;
366 addr <<= 20;
367 addr += cfg->address;
368 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
369 size <<= 20;
329 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " 370 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
330 "segment %hu buses %u - %u\n", 371 "segment %hu buses %u - %u\n",
331 i, (unsigned long)cfg->address, cfg->pci_segment, 372 i, (unsigned long)cfg->address, cfg->pci_segment,
332 (unsigned int)cfg->start_bus_number, 373 (unsigned int)cfg->start_bus_number,
333 (unsigned int)cfg->end_bus_number); 374 (unsigned int)cfg->end_bus_number);
334 375
335 if (!early && 376 if (!early)
336 is_acpi_reserved(cfg->address, cfg->address + size - 1)) { 377 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
337 printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved "
338 "in ACPI motherboard resources\n",
339 cfg->address);
340 valid = 1;
341 }
342 378
343 if (valid) 379 if (valid)
344 continue; 380 continue;
@@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early)
347 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 383 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
348 " reserved in ACPI motherboard resources\n", 384 " reserved in ACPI motherboard resources\n",
349 cfg->address); 385 cfg->address);
386
350 /* Don't try to do this check unless configuration 387 /* Don't try to do this check unless configuration
351 type 1 is available. how about type 2 ?*/ 388 type 1 is available. how about type 2 ?*/
352 if (raw_pci_ops && e820_all_mapped(cfg->address, 389 if (raw_pci_ops)
353 cfg->address + size - 1, 390 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1);
354 E820_RESERVED)) {
355 printk(KERN_NOTICE
356 "PCI: MCFG area at %Lx reserved in E820\n",
357 cfg->address);
358 valid = 1;
359 }
360 391
361 if (!valid) 392 if (!valid)
362 goto reject; 393 goto reject;
@@ -365,7 +396,7 @@ static void __init pci_mmcfg_reject_broken(int early)
365 return; 396 return;
366 397
367reject: 398reject:
368 printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); 399 printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
369 pci_mmcfg_arch_free(); 400 pci_mmcfg_arch_free();
370 kfree(pci_mmcfg_config); 401 kfree(pci_mmcfg_config);
371 pci_mmcfg_config = NULL; 402 pci_mmcfg_config = NULL;
@@ -374,7 +405,7 @@ reject:
374 405
375static int __initdata known_bridge; 406static int __initdata known_bridge;
376 407
377void __init __pci_mmcfg_init(int early) 408static void __init __pci_mmcfg_init(int early)
378{ 409{
379 /* MMCONFIG disabled */ 410 /* MMCONFIG disabled */
380 if ((pci_probe & PCI_PROBE_MMCONF) == 0) 411 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
@@ -403,11 +434,9 @@ void __init __pci_mmcfg_init(int early)
403 (pci_mmcfg_config[0].address == 0)) 434 (pci_mmcfg_config[0].address == 0))
404 return; 435 return;
405 436
406 if (pci_mmcfg_arch_init()) { 437 if (pci_mmcfg_arch_init())
407 if (known_bridge)
408 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
409 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 438 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
410 } else { 439 else {
411 /* 440 /*
412 * Signal not to attempt to insert mmcfg resources because 441 * Signal not to attempt to insert mmcfg resources because
413 * the architecture mmcfg setup could not initialize. 442 * the architecture mmcfg setup could not initialize.
@@ -444,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
444 * marked so it won't cause request errors when __request_region is 473 * marked so it won't cause request errors when __request_region is
445 * called. 474 * called.
446 */ 475 */
447 pci_mmcfg_insert_resources(0); 476 pci_mmcfg_insert_resources();
448 477
449 return 0; 478 return 0;
450} 479}
diff --git a/arch/x86/pci/mp_bus_to_node.c b/arch/x86/pci/mp_bus_to_node.c
deleted file mode 100644
index 022943999b84..000000000000
--- a/arch/x86/pci/mp_bus_to_node.c
+++ /dev/null
@@ -1,23 +0,0 @@
1#include <linux/pci.h>
2#include <linux/init.h>
3#include <linux/topology.h>
4
5#define BUS_NR 256
6
7static unsigned char mp_bus_to_node[BUS_NR];
8
9void set_mp_bus_to_node(int busnum, int node)
10{
11 if (busnum >= 0 && busnum < BUS_NR)
12 mp_bus_to_node[busnum] = (unsigned char) node;
13}
14
15int get_mp_bus_to_node(int busnum)
16{
17 int node;
18
19 if (busnum < 0 || busnum > (BUS_NR - 1))
20 return 0;
21 node = mp_bus_to_node[busnum];
22 return node;
23}
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index d9afbae5092b..1177845d3186 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,50 +1,26 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h> 8#include <mach_apic.h>
9#include <asm/mpspec.h>
9#include "pci.h" 10#include "pci.h"
10 11
11#define XQUAD_PORTIO_BASE 0xfe400000 12#define XQUAD_PORTIO_BASE 0xfe400000
12#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ 13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
13 14
14int mp_bus_id_to_node[MAX_MP_BUSSES];
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 15#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 16
17int mp_bus_id_to_local[MAX_MP_BUSSES];
18#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) 17#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
19 18
20void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
21 struct mpc_config_translation *translation)
22{
23 int quad = translation->trans_quad;
24 int local = translation->trans_local;
25
26 mp_bus_id_to_node[m->mpc_busid] = quad;
27 mp_bus_id_to_local[m->mpc_busid] = local;
28 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
29 m->mpc_busid, name, quad);
30}
31
32int quad_local_to_mp_bus_id [NR_CPUS/4][4];
33#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
34void mpc_oem_pci_bus(struct mpc_config_bus *m,
35 struct mpc_config_translation *translation)
36{
37 int quad = translation->trans_quad;
38 int local = translation->trans_local;
39
40 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
41}
42 20
43/* Where the IO area was mapped on multiquad, always 0 otherwise */ 21/* Where the IO area was mapped on multiquad, always 0 otherwise */
44void *xquad_portio; 22void *xquad_portio;
45#ifdef CONFIG_X86_NUMAQ
46EXPORT_SYMBOL(xquad_portio); 23EXPORT_SYMBOL(xquad_portio);
47#endif
48 24
49#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) 25#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
50 26
@@ -155,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
155 u8 busno, suba, subb; 131 u8 busno, suba, subb;
156 int quad = BUS2QUAD(d->bus->number); 132 int quad = BUS2QUAD(d->bus->number);
157 133
158 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 134 dev_info(&d->dev, "searching for i450NX host bridges\n");
159 reg = 0xd0; 135 reg = 0xd0;
160 for(pxb=0; pxb<2; pxb++) { 136 for(pxb=0; pxb<2; pxb++) {
161 pci_read_config_byte(d, reg++, &busno); 137 pci_read_config_byte(d, reg++, &busno);
162 pci_read_config_byte(d, reg++, &suba); 138 pci_read_config_byte(d, reg++, &suba);
163 pci_read_config_byte(d, reg++, &subb); 139 pci_read_config_byte(d, reg++, &subb);
164 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 140 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
141 pxb, busno, suba, subb);
165 if (busno) { 142 if (busno) {
166 /* Bus A */ 143 /* Bus A */
167 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); 144 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
@@ -175,10 +152,13 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
175} 152}
176DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 153DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
177 154
178static int __init pci_numa_init(void) 155int __init pci_numaq_init(void)
179{ 156{
180 int quad; 157 int quad;
181 158
159 if (!found_numaq)
160 return 0;
161
182 raw_pci_ops = &pci_direct_conf1_mq; 162 raw_pci_ops = &pci_direct_conf1_mq;
183 163
184 if (pcibios_scanned++) 164 if (pcibios_scanned++)
@@ -197,5 +177,3 @@ static int __init pci_numa_init(void)
197 } 177 }
198 return 0; 178 return 0;
199} 179}
200
201subsys_initcall(pci_numa_init);
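
The surviving part of numaq_32.c maps each quad's I/O ports into a 256 KiB window above xquad_portio, so XQUAD_PORT_ADDR(port, quad) is plain address arithmetic: xquad_portio + quad * 0x40000 + port. A tiny worked example of the offset calculation:

#include <stdint.h>
#include <stdio.h>

#define XQUAD_PORTIO_QUAD 0x40000	/* 256 KiB of port space per quad */

/* Offset of 'port' on 'quad' within the xquad_portio mapping. */
static uint32_t xquad_port_offset(uint16_t port, int quad)
{
	return (uint32_t)quad * XQUAD_PORTIO_QUAD + port;
}

int main(void)
{
	/* CONFIG_ADDRESS (0xCF8) on quad 2 -> xquad_portio + 0x80cf8 */
	printf("0x%x\n", xquad_port_offset(0xCF8, 2));
	return 0;
}
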
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 720c4c554534..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -27,6 +27,8 @@
27#define PCI_CAN_SKIP_ISA_ALIGN 0x8000 27#define PCI_CAN_SKIP_ISA_ALIGN 0x8000
28#define PCI_USE__CRS 0x10000 28#define PCI_USE__CRS 0x10000
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000
30 32
31extern unsigned int pci_probe; 33extern unsigned int pci_probe;
32extern unsigned long pirq_table_addr; 34extern unsigned long pirq_table_addr;
@@ -38,9 +40,6 @@ enum pci_bf_sort_state {
38 pci_dmi_bf, 40 pci_dmi_bf,
39}; 41};
40 42
41extern void __init dmi_check_pciprobe(void);
42extern void __init dmi_check_skip_isa_align(void);
43
44/* pci-i386.c */ 43/* pci-i386.c */
45 44
46extern unsigned int pcibios_max_latency; 45extern unsigned int pcibios_max_latency;
@@ -98,10 +97,20 @@ extern struct pci_raw_ops *raw_pci_ext_ops;
98 97
99extern struct pci_raw_ops pci_direct_conf1; 98extern struct pci_raw_ops pci_direct_conf1;
100 99
100/* arch_initcall level */
101extern int pci_direct_probe(void); 101extern int pci_direct_probe(void);
102extern void pci_direct_init(int type); 102extern void pci_direct_init(int type);
103extern void pci_pcbios_init(void); 103extern void pci_pcbios_init(void);
104extern int pci_olpc_init(void); 104extern int pci_olpc_init(void);
105extern void __init dmi_check_pciprobe(void);
106extern void __init dmi_check_skip_isa_align(void);
107
108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void);
111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
113extern int __init pcibios_init(void);
105 114
106/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
107 116
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index c2df4e97eed6..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -8,18 +8,19 @@
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/init.h> 9#include <linux/init.h>
10 10
11#include "cobalt.h" 11#include <asm/setup.h>
12#include "lithium.h" 12#include <asm/visws/cobalt.h>
13#include <asm/visws/lithium.h>
13 14
14#include "pci.h" 15#include "pci.h"
15 16
16static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } 17static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
17static void pci_visws_disable_irq(struct pci_dev *dev) { } 18static void pci_visws_disable_irq(struct pci_dev *dev) { }
18 19
19int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; 20/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */
20void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; 21/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */
21 22
22void __init pcibios_penalize_isa_irq(int irq, int active) {} 23/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */
23 24
24 25
25unsigned int pci_bus0, pci_bus1; 26unsigned int pci_bus0, pci_bus1;
@@ -85,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
85 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
86} 87}
87 88
88static int __init pcibios_init(void) 89int __init pci_visws_init(void)
89{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
90 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
91 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
92 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -104,5 +111,3 @@ static int __init pcibios_init(void)
104 pcibios_resource_survey(); 111 pcibios_resource_survey();
105 return 0; 112 return 0;
106} 113}
107
108subsys_initcall(pcibios_init);
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7dc5d5cf50a2..274d06082f48 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/mtrr.h> 12#include <asm/mtrr.h>
13#include <asm/mce.h> 13#include <asm/mce.h>
14#include <asm/xcr.h>
14 15
15static struct saved_context saved_context; 16static struct saved_context saved_context;
16 17
@@ -45,7 +46,7 @@ static void __save_processor_state(struct saved_context *ctxt)
45 ctxt->cr0 = read_cr0(); 46 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2(); 47 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3(); 48 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4(); 49 ctxt->cr4 = read_cr4_safe();
49} 50}
50 51
51/* Needed by apm.c */ 52/* Needed by apm.c */
@@ -98,7 +99,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
98 /* 99 /*
99 * control registers 100 * control registers
100 */ 101 */
101 write_cr4(ctxt->cr4); 102 /* cr4 was introduced in the Pentium CPU */
103 if (ctxt->cr4)
104 write_cr4(ctxt->cr4);
102 write_cr3(ctxt->cr3); 105 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 106 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 107 write_cr0(ctxt->cr0);
@@ -124,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
124 if (boot_cpu_has(X86_FEATURE_SEP)) 127 if (boot_cpu_has(X86_FEATURE_SEP))
125 enable_sep_cpu(); 128 enable_sep_cpu();
126 129
130 /*
131 * restore XCR0 for xsave capable cpu's.
132 */
133 if (cpu_has_xsave)
134 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
135
127 fix_processor_context(); 136 fix_processor_context();
128 do_fpu_end(); 137 do_fpu_end();
129 mtrr_ap_init(); 138 mtrr_ap_init();
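
The new resume hunk rewrites XCR0 with pcntxt_mask so that extended states beyond x87/SSE are enabled again after suspend on XSAVE-capable CPUs. XSETBV takes the register index in ECX and the 64-bit value in EDX:EAX; a sketch of a wrapper equivalent to the xsetbv() helper the patch pulls in from asm/xcr.h (emitted here as a raw opcode, as older assemblers lack the mnemonic):

#define XCR_XFEATURE_ENABLED_MASK	0	/* XCR0 */

/* Write one extended control register: value in edx:eax, index in ecx. */
static inline void xsetbv_sketch(unsigned int index, unsigned long long value)
{
	unsigned int eax = (unsigned int)value;
	unsigned int edx = (unsigned int)(value >> 32);

	asm volatile(".byte 0x0f, 0x01, 0xd1"	/* xsetbv */
		     : : "a" (eax), "d" (edx), "c" (index));
}
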
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd8..e3b6cf70d62c 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/xcr.h>
17 18
18static void fix_processor_context(void); 19static void fix_processor_context(void);
19 20
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
122 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 123 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
123 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 124 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
124 125
126 /*
127 * restore XCR0 for xsave capable cpu's.
128 */
129 if (cpu_has_xsave)
130 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
131
125 fix_processor_context(); 132 fix_processor_context();
126 133
127 do_fpu_end(); 134 do_fpu_end();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index b542355e0e34..6dd000dd7933 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -83,7 +83,7 @@ static int set_up_temporary_mappings(void)
83 83
84 /* Set up the direct mapping from scratch */ 84 /* Set up the direct mapping from scratch */
85 start = (unsigned long)pfn_to_kaddr(0); 85 start = (unsigned long)pfn_to_kaddr(0);
86 end = (unsigned long)pfn_to_kaddr(end_pfn); 86 end = (unsigned long)pfn_to_kaddr(max_pfn);
87 87
88 for (; start < end; start = next) { 88 for (; start < end; start = next) {
89 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); 89 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b95aa6cfe3cb..d1e9b53f9d33 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
1.text
2
3/* 1/*
4 * This may not use any stack, nor any variable that is not "NoSave": 2 * This may not use any stack, nor any variable that is not "NoSave":
5 * 3 *
@@ -12,25 +10,26 @@
12#include <asm/segment.h> 10#include <asm/segment.h>
13#include <asm/page.h> 11#include <asm/page.h>
14#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
13#include <asm/processor-flags.h>
15 14
16 .text 15.text
17 16
18ENTRY(swsusp_arch_suspend) 17ENTRY(swsusp_arch_suspend)
19
20 movl %esp, saved_context_esp 18 movl %esp, saved_context_esp
21 movl %ebx, saved_context_ebx 19 movl %ebx, saved_context_ebx
22 movl %ebp, saved_context_ebp 20 movl %ebp, saved_context_ebp
23 movl %esi, saved_context_esi 21 movl %esi, saved_context_esi
24 movl %edi, saved_context_edi 22 movl %edi, saved_context_edi
25 pushfl ; popl saved_context_eflags 23 pushfl
24 popl saved_context_eflags
26 25
27 call swsusp_save 26 call swsusp_save
28 ret 27 ret
29 28
30ENTRY(restore_image) 29ENTRY(restore_image)
31 movl resume_pg_dir, %ecx 30 movl resume_pg_dir, %eax
32 subl $__PAGE_OFFSET, %ecx 31 subl $__PAGE_OFFSET, %eax
33 movl %ecx, %cr3 32 movl %eax, %cr3
34 33
35 movl restore_pblist, %edx 34 movl restore_pblist, %edx
36 .p2align 4,,7 35 .p2align 4,,7
@@ -52,17 +51,21 @@ copy_loop:
52 51
53done: 52done:
54 /* go back to the original page tables */ 53 /* go back to the original page tables */
55 movl $swapper_pg_dir, %ecx 54 movl $swapper_pg_dir, %eax
56 subl $__PAGE_OFFSET, %ecx 55 subl $__PAGE_OFFSET, %eax
57 movl %ecx, %cr3 56 movl %eax, %cr3
58 /* Flush TLB, including "global" things (vmalloc) */ 57 /* Flush TLB, including "global" things (vmalloc) */
59 movl mmu_cr4_features, %eax 58 movl mmu_cr4_features, %ecx
60 movl %eax, %edx 59 jecxz 1f # cr4 Pentium and higher, skip if zero
61 andl $~(1<<7), %edx; # PGE 60 movl %ecx, %edx
61 andl $~(X86_CR4_PGE), %edx
62 movl %edx, %cr4; # turn off PGE 62 movl %edx, %cr4; # turn off PGE
63 movl %cr3, %ecx; # flush TLB 631:
64 movl %ecx, %cr3 64 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr4; # turn PGE back on 65 movl %eax, %cr3
66 jecxz 1f # cr4 Pentium and higher, skip if zero
67 movl %ecx, %cr4; # turn PGE back on
681:
66 69
67 movl saved_context_esp, %esp 70 movl saved_context_esp, %esp
68 movl saved_context_ebp, %ebp 71 movl saved_context_ebp, %ebp
@@ -70,7 +73,8 @@ done:
70 movl saved_context_esi, %esi 73 movl saved_context_esi, %esi
71 movl saved_context_edi, %edi 74 movl saved_context_edi, %edi
72 75
73 pushl saved_context_eflags ; popfl 76 pushl saved_context_eflags
77 popfl
74 78
75 xorl %eax, %eax 79 xorl %eax, %eax
76 80
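
restore_image now flushes the TLB the classic way: clear CR4.PGE so that "global" kernel mappings are dropped too, reload CR3, then restore CR4, with the new jecxz guards skipping the CR4 accesses entirely when mmu_cr4_features is zero (pre-Pentium CPUs that have no CR4). The same sequence expressed in C, as a sketch of what the assembly does (ring 0 only):

#define X86_CR4_PGE (1UL << 7)

static inline void sketch_write_cr4(unsigned long v)
{
	asm volatile("mov %0, %%cr4" : : "r" (v) : "memory");
}

static inline void sketch_reload_cr3(void)
{
	unsigned long v;

	asm volatile("mov %%cr3, %0" : "=r" (v));
	asm volatile("mov %0, %%cr3" : : "r" (v) : "memory");
}

/* Full TLB flush including global pages: toggle PGE around a CR3 reload. */
static void flush_tlb_all_sketch(unsigned long cr4_features)
{
	if (cr4_features)			/* CR4 exists (Pentium+) */
		sketch_write_cr4(cr4_features & ~X86_CR4_PGE);
	sketch_reload_cr3();			/* with PGE off, drops global entries too */
	if (cr4_features)
		sketch_write_cr4(cr4_features);	/* turn PGE back on */
}
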
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index efa2ba7c6005..1ef0f90813d6 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -23,7 +23,7 @@
23 23
24#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
25 25
26static long vdso_fallback_gettime(long clock, struct timespec *ts) 26notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
27{ 27{
28 long ret; 28 long ret;
29 asm("syscall" : "=a" (ret) : 29 asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
31 return ret; 31 return ret;
32} 32}
33 33
34static inline long vgetns(void) 34notrace static inline long vgetns(void)
35{ 35{
36 long v; 36 long v;
37 cycles_t (*vread)(void); 37 cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
40 return (v * gtod->clock.mult) >> gtod->clock.shift; 40 return (v * gtod->clock.mult) >> gtod->clock.shift;
41} 41}
42 42
43static noinline int do_realtime(struct timespec *ts) 43notrace static noinline int do_realtime(struct timespec *ts)
44{ 44{
45 unsigned long seq, ns; 45 unsigned long seq, ns;
46 do { 46 do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
54} 54}
55 55
56/* Copy of the version in kernel/time.c which we cannot directly access */ 56/* Copy of the version in kernel/time.c which we cannot directly access */
57static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) 57notrace static void
58vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58{ 59{
59 while (nsec >= NSEC_PER_SEC) { 60 while (nsec >= NSEC_PER_SEC) {
60 nsec -= NSEC_PER_SEC; 61 nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
68 ts->tv_nsec = nsec; 69 ts->tv_nsec = nsec;
69} 70}
70 71
71static noinline int do_monotonic(struct timespec *ts) 72notrace static noinline int do_monotonic(struct timespec *ts)
72{ 73{
73 unsigned long seq, ns, secs; 74 unsigned long seq, ns, secs;
74 do { 75 do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
82 return 0; 83 return 0;
83} 84}
84 85
85int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 86notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86{ 87{
87 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88 switch (clock) { 89 switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
96int clock_gettime(clockid_t, struct timespec *) 97int clock_gettime(clockid_t, struct timespec *)
97 __attribute__((weak, alias("__vdso_clock_gettime"))); 98 __attribute__((weak, alias("__vdso_clock_gettime")));
98 99
99int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 100notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100{ 101{
101 long ret; 102 long ret;
102 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
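The notrace annotations matter because the vDSO is built as part of the kernel, where function tracing adds -pg profiling calls, yet this code executes in user space and must never call back into kernel tracing hooks. A sketch (not from the patch) of what the annotation amounts to — in the kernel it expands to GCC's no_instrument_function attribute, which suppresses the inserted profiling calls:

#define notrace __attribute__((no_instrument_function))

notrace static long add_sketch(long a, long b)
{
	return a + b;        /* no mcount/profiling call is emitted for this function */
}

int main(void)
{
	return (int)add_sketch(2, 3) - 5;
}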
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index cf058fecfcee..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,30 +193,16 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206static int use_sysenter __read_mostly = -1; 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
207 201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
208#define vdso32_sysenter() (use_sysenter > 0)
209 202
210/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
211void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
212{ 205{
213 if (use_sysenter < 0) {
214 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
215 use_sysenter = 1;
216 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
217 use_sysenter = 1;
218 }
219
220 /* Load these always in case some future AMD CPU supports 206 /* Load these always in case some future AMD CPU supports
221 SYSENTER from compat mode too. */ 207 SYSENTER from compat mode too. */
222 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 208 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
@@ -235,6 +221,7 @@ static inline void map_compat_vdso(int map)
235#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
236 222
237#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
238 225
239void enable_sep_cpu(void) 226void enable_sep_cpu(void)
240{ 227{
@@ -305,12 +292,15 @@ int __init sysenter_setup(void)
305 gate_vma_init(); 292 gate_vma_init();
306#endif 293#endif
307 294
308 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
309 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
310 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
311 } else { 298 } else if (vdso32_sysenter()){
312 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
313 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
314 } 304 }
315 305
316 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
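The hunk above turns the old two-way default/sysenter choice into a three-way fallthrough: prefer the SYSCALL image when the CPU supports SYSCALL from compat mode, then SYSENTER, and fall back to the int $0x80 image. A stand-alone sketch of that selection (not from the patch; the feature predicates and image names are stand-ins):

#include <stdio.h>

static int vdso32_syscall(void)  { return 0; }   /* e.g. boot_cpu_has(X86_FEATURE_SYSCALL32) */
static int vdso32_sysenter(void) { return 1; }   /* e.g. boot_cpu_has(X86_FEATURE_SEP) */

int main(void)
{
	const char *image;

	if (vdso32_syscall())
		image = "vdso32-syscall.so";
	else if (vdso32_sysenter())
		image = "vdso32-sysenter.so";
	else
		image = "vdso32-int80.so";

	printf("selected vDSO image: %s\n", image);
	return 0;
}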
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index c8097f17f8a9..9fbc6b20026b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -13,7 +13,8 @@
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h" 14#include "vextern.h"
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 18{
18 unsigned int p; 19 unsigned int p;
19 20
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 3fdd51497a83..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,12 +16,13 @@
16#include "vextern.h" /* Just for VMAGIC. */ 16#include "vextern.h" /* Just for VMAGIC. */
17#undef VEXTERN 17#undef VEXTERN
18 18
19int vdso_enabled = 1; 19unsigned int __read_mostly vdso_enabled = 1;
20 20
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
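The length the old code recomputed on every exec — round_up(vdso_end - vdso_start, PAGE_SIZE) — is now cached once in vdso_size at init time. A user-space sketch (not from the patch) of that rounding; the macro here is a generic version, whereas the kernel's round_up() assumes a power-of-two alignment and uses a mask:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long image_len = 6500;                        /* illustrative vdso_end - vdso_start */
	unsigned long vdso_size = round_up(image_len, PAGE_SIZE);

	printf("image %lu bytes -> mapping %lu bytes (%lu pages)\n",
	       image_len, vdso_size, vdso_size / PAGE_SIZE);
	return 0;
}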
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6c388e593bc8..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,33 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15
16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32
19 default 32 if X86_64
20 depends on XEN
21 help
22 The pseudo-physical to machine address array is sized
23 according to the maximum possible memory size of a Xen
24 domain. This array uses 1 page per gigabyte, so there's no
25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on XEN && PM
30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2 time.o manage.o xen-asm.o grant-table.o 2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
3 7
4obj-$(CONFIG_SMP) += smp.o 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
10
11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
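The new debugfs helper sizes its output with a two-pass scheme: format_array() is first called with a NULL buffer to learn the required length, the buffer is allocated, and the same function is called again to fill it. A plain user-space sketch of that pattern (not from the patch), relying only on snprintf's return value:

#include <stdio.h>
#include <stdlib.h>

static size_t format_u32s(char *buf, size_t bufsize, const unsigned *a, unsigned n)
{
	size_t ret = 0;
	unsigned i;

	for (i = 0; i < n; i++) {
		size_t len = snprintf(buf, bufsize, "%u", a[i]) + 1;  /* +1 for ' ' or '\n' */
		ret += len;
		if (buf) {
			buf += len;
			bufsize -= len;
			buf[-1] = (i == n - 1) ? '\n' : ' ';
		}
	}
	ret++;                       /* trailing '\0' */
	if (buf)
		*buf = '\0';
	return ret;
}

int main(void)
{
	unsigned vals[] = { 7, 42, 123456 };
	size_t len = format_u32s(NULL, 0, vals, 3);   /* pass 1: compute size only */
	char *s = malloc(len);

	if (!s)
		return 1;
	format_u32s(s, len, vals, 3);                 /* pass 2: fill the buffer */
	fputs(s, stdout);
	free(s);
	return 0;
}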
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..0013a729b41d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,16 +30,18 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
35#include <xen/hvc-console.h>
36 36
37#include <asm/paravirt.h> 37#include <asm/paravirt.h>
38#include <asm/apic.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
@@ -55,6 +57,21 @@ EXPORT_SYMBOL_GPL(hypercall_page);
55DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
56DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
57 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
63/*
64 * Identity map, in addition to plain kernel map. This needs to be
65 * large enough to allocate page table pages to allocate the rest.
66 * Each page can map 2MB.
67 */
68static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
69
70#ifdef CONFIG_X86_64
71/* l3 pud for userspace vsyscall mapping */
72static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
73#endif /* CONFIG_X86_64 */
74
58/* 75/*
59 * Note about cr3 (pagetable base) values: 76 * Note about cr3 (pagetable base) values:
60 * 77 *
@@ -75,13 +92,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
75struct start_info *xen_start_info; 92struct start_info *xen_start_info;
76EXPORT_SYMBOL_GPL(xen_start_info); 93EXPORT_SYMBOL_GPL(xen_start_info);
77 94
78static /* __initdata */ struct shared_info dummy_shared_info; 95struct shared_info xen_dummy_shared_info;
79 96
80/* 97/*
81 * Point at some empty memory to start with. We map the real shared_info 98 * Point at some empty memory to start with. We map the real shared_info
82 * page as soon as fixmap is up and running. 99 * page as soon as fixmap is up and running.
83 */ 100 */
84struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; 101struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
85 102
86/* 103/*
87 * Flag to determine whether vcpu info placement is available on all 104 * Flag to determine whether vcpu info placement is available on all
@@ -96,15 +113,22 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
96 * 113 *
97 * 0: not available, 1: available 114 * 0: not available, 1: available
98 */ 115 */
99static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
100 123
101static void __init xen_vcpu_setup(int cpu) 124
125static void xen_vcpu_setup(int cpu)
102{ 126{
103 struct vcpu_register_vcpu_info info; 127 struct vcpu_register_vcpu_info info;
104 int err; 128 int err;
105 struct vcpu_info *vcpup; 129 struct vcpu_info *vcpup;
106 130
107 BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); 131 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
108 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 132 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
109 133
110 if (!have_vcpu_info_placement) 134 if (!have_vcpu_info_placement)
@@ -136,11 +160,45 @@ static void __init xen_vcpu_setup(int cpu)
136 } 160 }
137} 161}
138 162
163/*
164 * On restore, set the vcpu placement up again.
165 * If it fails, then we're in a bad state, since
166 * we can't back out from using it...
167 */
168void xen_vcpu_restore(void)
169{
170 if (have_vcpu_info_placement) {
171 int cpu;
172
173 for_each_online_cpu(cpu) {
174 bool other_cpu = (cpu != smp_processor_id());
175
176 if (other_cpu &&
177 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
178 BUG();
179
180 xen_vcpu_setup(cpu);
181
182 if (other_cpu &&
183 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
184 BUG();
185 }
186
187 BUG_ON(!have_vcpu_info_placement);
188 }
189}
190
139static void __init xen_banner(void) 191static void __init xen_banner(void)
140{ 192{
193 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
194 struct xen_extraversion extra;
195 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
196
141 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 197 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
142 pv_info.name); 198 pv_info.name);
143 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 199 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
200 version >> 16, version & 0xffff, extra.extraversion,
201 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
144} 202}
145 203
146static void xen_cpuid(unsigned int *ax, unsigned int *bx, 204static void xen_cpuid(unsigned int *ax, unsigned int *bx,
@@ -178,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
178 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
179} 237}
180 238
181static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
182{
183 struct vcpu_info *vcpu;
184 unsigned long flags;
185
186 vcpu = x86_read_percpu(xen_vcpu);
187
188 /* flag has opposite sense of mask */
189 flags = !vcpu->evtchn_upcall_mask;
190
191 /* convert to IF type flag
192 -0 -> 0x00000000
193 -1 -> 0xffffffff
194 */
195 return (-flags) & X86_EFLAGS_IF;
196}
197
198static void xen_restore_fl(unsigned long flags)
199{ 240{
200 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
201 242 xen_mc_flush();
202 /* convert from IF type flag */
203 flags = !(flags & X86_EFLAGS_IF);
204
205 /* There's a one instruction preempt window here. We need to
206 make sure we're don't switch CPUs between getting the vcpu
207 pointer and updating the mask. */
208 preempt_disable();
209 vcpu = x86_read_percpu(xen_vcpu);
210 vcpu->evtchn_upcall_mask = flags;
211 preempt_enable_no_resched();
212
213 /* Doesn't matter if we get preempted here, because any
214 pending event will get dealt with anyway. */
215
216 if (flags == 0) {
217 preempt_check_resched();
218 barrier(); /* unmask then check (avoid races) */
219 if (unlikely(vcpu->evtchn_upcall_pending))
220 force_evtchn_callback();
221 }
222} 243}
223 244
224static void xen_irq_disable(void) 245static unsigned long xen_store_tr(void)
225{ 246{
226 /* There's a one instruction preempt window here. We need to 247 return 0;
227 make sure we're don't switch CPUs between getting the vcpu
228 pointer and updating the mask. */
229 preempt_disable();
230 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
231 preempt_enable_no_resched();
232} 248}
233 249
234static void xen_irq_enable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
235{ 257{
236 struct vcpu_info *vcpu; 258 int level;
259 pte_t *ptep;
260 pte_t pte;
261 unsigned long pfn;
262 struct page *page;
237 263
238 /* There's a one instruction preempt window here. We need to 264 ptep = lookup_address((unsigned long)v, &level);
239 make sure we're don't switch CPUs between getting the vcpu 265 BUG_ON(ptep == NULL);
240 pointer and updating the mask. */
241 preempt_disable();
242 vcpu = x86_read_percpu(xen_vcpu);
243 vcpu->evtchn_upcall_mask = 0;
244 preempt_enable_no_resched();
245 266
246 /* Doesn't matter if we get preempted here, because any 267 pfn = pte_pfn(*ptep);
247 pending event will get dealt with anyway. */ 268 page = pfn_to_page(pfn);
248 269
249 barrier(); /* unmask then check (avoid races) */ 270 pte = pfn_pte(pfn, prot);
250 if (unlikely(vcpu->evtchn_upcall_pending))
251 force_evtchn_callback();
252}
253 271
254static void xen_safe_halt(void) 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
255{
256 /* Blocking includes an implicit local_irq_enable(). */
257 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
258 BUG(); 273 BUG();
259}
260 274
261static void xen_halt(void) 275 if (!PageHighMem(page)) {
262{ 276 void *av = __va(PFN_PHYS(pfn));
263 if (irqs_disabled()) 277
264 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 278 if (av != v)
265 else 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
266 xen_safe_halt(); 280 BUG();
281 } else
282 kmap_flush_unused();
267} 283}
268 284
269static void xen_leave_lazy(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
270{ 286{
271 paravirt_leave_lazy(paravirt_get_lazy_mode()); 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
272 xen_mc_flush(); 288 int i;
289
290 for(i = 0; i < entries; i += entries_per_page)
291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
273} 292}
274 293
275static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
276{ 295{
277 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
278} 301}
279 302
280static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -332,14 +355,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
332 355
333static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 356static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
334{ 357{
335 xen_mc_batch();
336
337 load_TLS_descriptor(t, cpu, 0);
338 load_TLS_descriptor(t, cpu, 1);
339 load_TLS_descriptor(t, cpu, 2);
340
341 xen_mc_issue(PARAVIRT_LAZY_CPU);
342
343 /* 358 /*
344 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 359 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
345 * it means we're in a context switch, and %gs has just been 360 * it means we're in a context switch, and %gs has just been
@@ -348,16 +363,44 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
348 * Either way, it has been saved, and the new value will get 363 * Either way, it has been saved, and the new value will get
349 * loaded properly. This will go away as soon as Xen has been 364 * loaded properly. This will go away as soon as Xen has been
350 * modified to not save/restore %gs for normal hypercalls. 365 * modified to not save/restore %gs for normal hypercalls.
366 *
367 * On x86_64, this hack is not used for %gs, because gs points
368 * to KERNEL_GS_BASE (and uses it for PDA references), so we
369 * must not zero %gs on x86_64
370 *
371 * For x86_64, we need to zero %fs, otherwise we may get an
372 * exception between the new %fs descriptor being loaded and
373 * %fs being effectively cleared at __switch_to().
351 */ 374 */
352 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 375 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
376#ifdef CONFIG_X86_32
353 loadsegment(gs, 0); 377 loadsegment(gs, 0);
378#else
379 loadsegment(fs, 0);
380#endif
381 }
382
383 xen_mc_batch();
384
385 load_TLS_descriptor(t, cpu, 0);
386 load_TLS_descriptor(t, cpu, 1);
387 load_TLS_descriptor(t, cpu, 2);
388
389 xen_mc_issue(PARAVIRT_LAZY_CPU);
390}
391
392#ifdef CONFIG_X86_64
393static void xen_load_gs_index(unsigned int idx)
394{
395 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
396 BUG();
354} 397}
398#endif
355 399
356static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
357 const void *ptr) 401 const void *ptr)
358{ 402{
359 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
360 xmaddr_t mach_lp = virt_to_machine(lp);
361 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
362 405
363 preempt_disable(); 406 preempt_disable();
@@ -369,23 +412,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
369 preempt_enable(); 412 preempt_enable();
370} 413}
371 414
372static int cvt_gate_to_trap(int vector, u32 low, u32 high, 415static int cvt_gate_to_trap(int vector, const gate_desc *val,
373 struct trap_info *info) 416 struct trap_info *info)
374{ 417{
375 u8 type, dpl; 418 if (val->type != 0xf && val->type != 0xe)
376
377 type = (high >> 8) & 0x1f;
378 dpl = (high >> 13) & 3;
379
380 if (type != 0xf && type != 0xe)
381 return 0; 419 return 0;
382 420
383 info->vector = vector; 421 info->vector = vector;
384 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 422 info->address = gate_offset(*val);
385 info->cs = low >> 16; 423 info->cs = gate_segment(*val);
386 info->flags = dpl; 424 info->flags = val->dpl;
387 /* interrupt gates clear IF */ 425 /* interrupt gates clear IF */
388 if (type == 0xe) 426 if (val->type == 0xe)
389 info->flags |= 4; 427 info->flags |= 4;
390 428
391 return 1; 429 return 1;
@@ -412,11 +450,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
412 450
413 if (p >= start && (p + 8) <= end) { 451 if (p >= start && (p + 8) <= end) {
414 struct trap_info info[2]; 452 struct trap_info info[2];
415 u32 *desc = (u32 *)g;
416 453
417 info[1].address = 0; 454 info[1].address = 0;
418 455
419 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 456 if (cvt_gate_to_trap(entrynum, g, &info[0]))
420 if (HYPERVISOR_set_trap_table(info)) 457 if (HYPERVISOR_set_trap_table(info))
421 BUG(); 458 BUG();
422 } 459 }
@@ -429,13 +466,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
429{ 466{
430 unsigned in, out, count; 467 unsigned in, out, count;
431 468
432 count = (desc->size+1) / 8; 469 count = (desc->size+1) / sizeof(gate_desc);
433 BUG_ON(count > 256); 470 BUG_ON(count > 256);
434 471
435 for (in = out = 0; in < count; in++) { 472 for (in = out = 0; in < count; in++) {
436 const u32 *entry = (u32 *)(desc->address + in * 8); 473 gate_desc *entry = (gate_desc*)(desc->address) + in;
437 474
438 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 475 if (cvt_gate_to_trap(in, entry, &traps[out]))
439 out++; 476 out++;
440 } 477 }
441 traps[out].address = 0; 478 traps[out].address = 0;
@@ -496,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
496} 533}
497 534
498static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
499 struct thread_struct *thread) 536 struct thread_struct *thread)
500{ 537{
501 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
502 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -517,16 +554,47 @@ static void xen_io_delay(void)
517} 554}
518 555
519#ifdef CONFIG_X86_LOCAL_APIC 556#ifdef CONFIG_X86_LOCAL_APIC
520static u32 xen_apic_read(unsigned long reg) 557static u32 xen_apic_read(u32 reg)
521{ 558{
522 return 0; 559 return 0;
523} 560}
524 561
525static void xen_apic_write(unsigned long reg, u32 val) 562static void xen_apic_write(u32 reg, u32 val)
526{ 563{
527 /* Warn to see if there's any stray references */ 564 /* Warn to see if there's any stray references */
528 WARN_ON(1); 565 WARN_ON(1);
529} 566}
567
568static u64 xen_apic_icr_read(void)
569{
570 return 0;
571}
572
573static void xen_apic_icr_write(u32 low, u32 id)
574{
575 /* Warn to see if there's any stray references */
576 WARN_ON(1);
577}
578
579static void xen_apic_wait_icr_idle(void)
580{
581 return;
582}
583
584static u32 xen_safe_apic_wait_icr_idle(void)
585{
586 return 0;
587}
588
589static struct apic_ops xen_basic_apic_ops = {
590 .read = xen_apic_read,
591 .write = xen_apic_write,
592 .icr_read = xen_apic_icr_read,
593 .icr_write = xen_apic_icr_write,
594 .wait_icr_idle = xen_apic_wait_icr_idle,
595 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
596};
597
530#endif 598#endif
531 599
532static void xen_flush_tlb(void) 600static void xen_flush_tlb(void)
@@ -607,6 +675,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
607 xen_mc_issue(PARAVIRT_LAZY_MMU); 675 xen_mc_issue(PARAVIRT_LAZY_MMU);
608} 676}
609 677
678static void xen_clts(void)
679{
680 struct multicall_space mcs;
681
682 mcs = xen_mc_entry(0);
683
684 MULTI_fpu_taskswitch(mcs.mc, 0);
685
686 xen_mc_issue(PARAVIRT_LAZY_CPU);
687}
688
689static void xen_write_cr0(unsigned long cr0)
690{
691 struct multicall_space mcs;
692
693 /* Only pay attention to cr0.TS; everything else is
694 ignored. */
695 mcs = xen_mc_entry(0);
696
697 MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
698
699 xen_mc_issue(PARAVIRT_LAZY_CPU);
700}
701
610static void xen_write_cr2(unsigned long cr2) 702static void xen_write_cr2(unsigned long cr2)
611{ 703{
612 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 704 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +716,10 @@ static unsigned long xen_read_cr2_direct(void)
624 716
625static void xen_write_cr4(unsigned long cr4) 717static void xen_write_cr4(unsigned long cr4)
626{ 718{
627 /* Just ignore cr4 changes; Xen doesn't allow us to do 719 cr4 &= ~X86_CR4_PGE;
628 anything anyway. */ 720 cr4 &= ~X86_CR4_PSE;
721
722 native_write_cr4(cr4);
629} 723}
630 724
631static unsigned long xen_read_cr3(void) 725static unsigned long xen_read_cr3(void)
@@ -638,36 +732,105 @@ static void set_current_cr3(void *v)
638 x86_write_percpu(xen_current_cr3, (unsigned long)v); 732 x86_write_percpu(xen_current_cr3, (unsigned long)v);
639} 733}
640 734
641static void xen_write_cr3(unsigned long cr3) 735static void __xen_write_cr3(bool kernel, unsigned long cr3)
642{ 736{
643 struct mmuext_op *op; 737 struct mmuext_op *op;
644 struct multicall_space mcs; 738 struct multicall_space mcs;
645 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 739 unsigned long mfn;
646 740
647 BUG_ON(preemptible()); 741 if (cr3)
742 mfn = pfn_to_mfn(PFN_DOWN(cr3));
743 else
744 mfn = 0;
648 745
649 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 746 WARN_ON(mfn == 0 && kernel);
650 747
651 /* Update while interrupts are disabled, so its atomic with 748 mcs = __xen_mc_entry(sizeof(*op));
652 respect to ipis */
653 x86_write_percpu(xen_cr3, cr3);
654 749
655 op = mcs.args; 750 op = mcs.args;
656 op->cmd = MMUEXT_NEW_BASEPTR; 751 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
657 op->arg1.mfn = mfn; 752 op->arg1.mfn = mfn;
658 753
659 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 754 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
660 755
661 /* Update xen_update_cr3 once the batch has actually 756 if (kernel) {
662 been submitted. */ 757 x86_write_percpu(xen_cr3, cr3);
663 xen_mc_callback(set_current_cr3, (void *)cr3); 758
759 /* Update xen_current_cr3 once the batch has actually
760 been submitted. */
761 xen_mc_callback(set_current_cr3, (void *)cr3);
762 }
763}
764
765static void xen_write_cr3(unsigned long cr3)
766{
767 BUG_ON(preemptible());
768
769 xen_mc_batch(); /* disables interrupts */
770
771 /* Update while interrupts are disabled, so its atomic with
772 respect to ipis */
773 x86_write_percpu(xen_cr3, cr3);
774
775 __xen_write_cr3(true, cr3);
776
777#ifdef CONFIG_X86_64
778 {
779 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
780 if (user_pgd)
781 __xen_write_cr3(false, __pa(user_pgd));
782 else
783 __xen_write_cr3(false, 0);
784 }
785#endif
664 786
665 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 787 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
666} 788}
667 789
790static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
791{
792 int ret;
793
794 ret = 0;
795
796 switch(msr) {
797#ifdef CONFIG_X86_64
798 unsigned which;
799 u64 base;
800
801 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
802 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
803 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
804
805 set:
806 base = ((u64)high << 32) | low;
807 if (HYPERVISOR_set_segment_base(which, base) != 0)
808 ret = -EFAULT;
809 break;
810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
824 default:
825 ret = native_write_msr_safe(msr, low, high);
826 }
827
828 return ret;
829}
830
668/* Early in boot, while setting up the initial pagetable, assume 831/* Early in boot, while setting up the initial pagetable, assume
669 everything is pinned. */ 832 everything is pinned. */
670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 833static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
671{ 834{
672#ifdef CONFIG_FLATMEM 835#ifdef CONFIG_FLATMEM
673 BUG_ON(mem_map); /* should only be used early */ 836 BUG_ON(mem_map); /* should only be used early */
@@ -677,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
677 840
678/* Early release_pte assumes that all pts are pinned, since there's 841/* Early release_pte assumes that all pts are pinned, since there's
679 only init_mm and anything attached to that is pinned. */ 842 only init_mm and anything attached to that is pinned. */
680static void xen_release_pte_init(u32 pfn) 843static void xen_release_pte_init(unsigned long pfn)
681{ 844{
682 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 845 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
683} 846}
@@ -693,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
693 856
694/* This needs to make sure the new pte page is pinned iff its being 857/* This needs to make sure the new pte page is pinned iff its being
695 attached to a pinned pagetable. */ 858 attached to a pinned pagetable. */
696static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) 859static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
697{ 860{
698 struct page *page = pfn_to_page(pfn); 861 struct page *page = pfn_to_page(pfn);
699 862
@@ -701,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
701 SetPagePinned(page); 864 SetPagePinned(page);
702 865
703 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
704 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
705 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
706 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
707 } else 870 } else
708 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
@@ -711,24 +874,66 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
711 } 874 }
712} 875}
713 876
714static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) 877static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
715{ 878{
716 xen_alloc_ptpage(mm, pfn, PT_PTE); 879 xen_alloc_ptpage(mm, pfn, PT_PTE);
717} 880}
718 881
719static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) 882static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
720{ 883{
721 xen_alloc_ptpage(mm, pfn, PT_PMD); 884 xen_alloc_ptpage(mm, pfn, PT_PMD);
722} 885}
723 886
887static int xen_pgd_alloc(struct mm_struct *mm)
888{
889 pgd_t *pgd = mm->pgd;
890 int ret = 0;
891
892 BUG_ON(PagePinned(virt_to_page(pgd)));
893
894#ifdef CONFIG_X86_64
895 {
896 struct page *page = virt_to_page(pgd);
897 pgd_t *user_pgd;
898
899 BUG_ON(page->private != 0);
900
901 ret = -ENOMEM;
902
903 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
904 page->private = (unsigned long)user_pgd;
905
906 if (user_pgd != NULL) {
907 user_pgd[pgd_index(VSYSCALL_START)] =
908 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
909 ret = 0;
910 }
911
912 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
913 }
914#endif
915
916 return ret;
917}
918
919static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
920{
921#ifdef CONFIG_X86_64
922 pgd_t *user_pgd = xen_get_user_pgd(pgd);
923
924 if (user_pgd)
925 free_page((unsigned long)user_pgd);
926#endif
927}
928
724/* This should never happen until we're OK to use struct page */ 929/* This should never happen until we're OK to use struct page */
725static void xen_release_ptpage(u32 pfn, unsigned level) 930static void xen_release_ptpage(unsigned long pfn, unsigned level)
726{ 931{
727 struct page *page = pfn_to_page(pfn); 932 struct page *page = pfn_to_page(pfn);
728 933
729 if (PagePinned(page)) { 934 if (PagePinned(page)) {
730 if (!PageHighMem(page)) { 935 if (!PageHighMem(page)) {
731 if (level == PT_PTE) 936 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
732 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 937 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
733 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 938 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
734 } 939 }
@@ -736,16 +941,28 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
736 } 941 }
737} 942}
738 943
739static void xen_release_pte(u32 pfn) 944static void xen_release_pte(unsigned long pfn)
740{ 945{
741 xen_release_ptpage(pfn, PT_PTE); 946 xen_release_ptpage(pfn, PT_PTE);
742} 947}
743 948
744static void xen_release_pmd(u32 pfn) 949static void xen_release_pmd(unsigned long pfn)
745{ 950{
746 xen_release_ptpage(pfn, PT_PMD); 951 xen_release_ptpage(pfn, PT_PMD);
747} 952}
748 953
954#if PAGETABLE_LEVELS == 4
955static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
956{
957 xen_alloc_ptpage(mm, pfn, PT_PUD);
958}
959
960static void xen_release_pud(unsigned long pfn)
961{
962 xen_release_ptpage(pfn, PT_PUD);
963}
964#endif
965
749#ifdef CONFIG_HIGHPTE 966#ifdef CONFIG_HIGHPTE
750static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 967static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
751{ 968{
@@ -763,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
763} 980}
764#endif 981#endif
765 982
983#ifdef CONFIG_X86_32
766static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 984static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
767{ 985{
768 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 986 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -781,71 +999,20 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
781 999
782 xen_set_pte(ptep, pte); 1000 xen_set_pte(ptep, pte);
783} 1001}
1002#endif
784 1003
785static __init void xen_pagetable_setup_start(pgd_t *base) 1004static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 1005{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
789
790 /* special set_pte for pagetable initialization */
791 pv_mmu_ops.set_pte = xen_set_pte_init;
792
793 init_mm.pgd = base;
794 /*
795 * copy top-level of Xen-supplied pagetable into place. This
796 * is a stand-in while we copy the pmd pages.
797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799
800 /*
801 * For PAE, need to allocate new pmds, rather than
802 * share Xen's, since Xen doesn't like pmd's being
803 * shared between address spaces.
804 */
805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808
809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
810 PAGE_SIZE);
811
812 make_lowmem_page_readonly(pmd);
813
814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
815 } else
816 pgd_clear(&base[i]);
817 }
818
819 /* make sure zero_page is mapped RO so we can use it in pagetables */
820 make_lowmem_page_readonly(empty_zero_page);
821 make_lowmem_page_readonly(base);
822 /*
823 * Switch to new pagetable. This is done before
824 * pagetable_init has done anything so that the new pages
825 * added to the table can be prepared properly for Xen.
826 */
827 xen_write_cr3(__pa(base));
828
829 /* Unpin initial Xen pagetable */
830 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
831 PFN_DOWN(__pa(xen_start_info->pt_base)));
832} 1006}
833 1007
834static __init void setup_shared_info(void) 1008void xen_setup_shared_info(void)
835{ 1009{
836 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1010 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
837 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1011 set_fixmap(FIX_PARAVIRT_BOOTMAP,
838 1012 xen_start_info->shared_info);
839 /* 1013
840 * Create a mapping for the shared info page. 1014 HYPERVISOR_shared_info =
841 * Should be set_fixmap(), but shared_info is a machine 1015 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
842 * address with no corresponding pseudo-phys address.
843 */
844 set_pte_mfn(addr,
845 PFN_DOWN(xen_start_info->shared_info),
846 PAGE_KERNEL);
847
848 HYPERVISOR_shared_info = (struct shared_info *)addr;
849 } else 1016 } else
850 HYPERVISOR_shared_info = 1017 HYPERVISOR_shared_info =
851 (struct shared_info *)__va(xen_start_info->shared_info); 1018 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1021,43 @@ static __init void setup_shared_info(void)
854 /* In UP this is as good a place as any to set up shared info */ 1021 /* In UP this is as good a place as any to set up shared info */
855 xen_setup_vcpu_info_placement(); 1022 xen_setup_vcpu_info_placement();
856#endif 1023#endif
1024
1025 xen_setup_mfn_list_list();
857} 1026}
858 1027
859static __init void xen_pagetable_setup_done(pgd_t *base) 1028static __init void xen_pagetable_setup_done(pgd_t *base)
860{ 1029{
1030 xen_setup_shared_info();
1031}
1032
1033static __init void xen_post_allocator_init(void)
1034{
1035 pv_mmu_ops.set_pte = xen_set_pte;
1036 pv_mmu_ops.set_pmd = xen_set_pmd;
1037 pv_mmu_ops.set_pud = xen_set_pud;
1038#if PAGETABLE_LEVELS == 4
1039 pv_mmu_ops.set_pgd = xen_set_pgd;
1040#endif
1041
861 /* This will work as long as patching hasn't happened yet 1042 /* This will work as long as patching hasn't happened yet
862 (which it hasn't) */ 1043 (which it hasn't) */
863 pv_mmu_ops.alloc_pte = xen_alloc_pte; 1044 pv_mmu_ops.alloc_pte = xen_alloc_pte;
864 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1045 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
865 pv_mmu_ops.release_pte = xen_release_pte; 1046 pv_mmu_ops.release_pte = xen_release_pte;
866 pv_mmu_ops.release_pmd = xen_release_pmd; 1047 pv_mmu_ops.release_pmd = xen_release_pmd;
867 pv_mmu_ops.set_pte = xen_set_pte; 1048#if PAGETABLE_LEVELS == 4
868 1049 pv_mmu_ops.alloc_pud = xen_alloc_pud;
869 setup_shared_info(); 1050 pv_mmu_ops.release_pud = xen_release_pud;
1051#endif
870 1052
871 /* Actually pin the pagetable down, but we can't set PG_pinned 1053#ifdef CONFIG_X86_64
872 yet because the page structures don't exist yet. */ 1054 SetPagePinned(virt_to_page(level3_user_vsyscall));
873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); 1055#endif
1056 xen_mark_init_mm_pinned();
874} 1057}
875 1058
876/* This is called once we have the cpu_possible_map */ 1059/* This is called once we have the cpu_possible_map */
877void __init xen_setup_vcpu_info_placement(void) 1060void xen_setup_vcpu_info_placement(void)
878{ 1061{
879 int cpu; 1062 int cpu;
880 1063
@@ -947,6 +1130,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
947 return ret; 1130 return ret;
948} 1131}
949 1132
1133static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1134{
1135 pte_t pte;
1136
1137 phys >>= PAGE_SHIFT;
1138
1139 switch (idx) {
1140 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1141#ifdef CONFIG_X86_F00F_BUG
1142 case FIX_F00F_IDT:
1143#endif
1144#ifdef CONFIG_X86_32
1145 case FIX_WP_TEST:
1146 case FIX_VDSO:
1147# ifdef CONFIG_HIGHMEM
1148 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1149# endif
1150#else
1151 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1152#endif
1153#ifdef CONFIG_X86_LOCAL_APIC
1154 case FIX_APIC_BASE: /* maps dummy local APIC */
1155#endif
1156 pte = pfn_pte(phys, prot);
1157 break;
1158
1159 default:
1160 pte = mfn_pte(phys, prot);
1161 break;
1162 }
1163
1164 __native_set_fixmap(idx, pte);
1165
1166#ifdef CONFIG_X86_64
1167 /* Replicate changes to map the vsyscall page into the user
1168 pagetable vsyscall mapping. */
1169 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1170 unsigned long vaddr = __fix_to_virt(idx);
1171 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1172 }
1173#endif
1174}
1175
950static const struct pv_info xen_info __initdata = { 1176static const struct pv_info xen_info __initdata = {
951 .paravirt_enabled = 1, 1177 .paravirt_enabled = 1,
952 .shared_kernel_pmd = 0, 1178 .shared_kernel_pmd = 0,
@@ -960,7 +1186,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
960 .banner = xen_banner, 1186 .banner = xen_banner,
961 .memory_setup = xen_memory_setup, 1187 .memory_setup = xen_memory_setup,
962 .arch_setup = xen_arch_setup, 1188 .arch_setup = xen_arch_setup,
963 .post_allocator_init = xen_mark_init_mm_pinned, 1189 .post_allocator_init = xen_post_allocator_init,
964}; 1190};
965 1191
966static const struct pv_time_ops xen_time_ops __initdata = { 1192static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1194,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
968 1194
969 .set_wallclock = xen_set_wallclock, 1195 .set_wallclock = xen_set_wallclock,
970 .get_wallclock = xen_get_wallclock, 1196 .get_wallclock = xen_get_wallclock,
971 .get_cpu_khz = xen_cpu_khz, 1197 .get_tsc_khz = xen_tsc_khz,
972 .sched_clock = xen_sched_clock, 1198 .sched_clock = xen_sched_clock,
973}; 1199};
974 1200
@@ -978,10 +1204,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
978 .set_debugreg = xen_set_debugreg, 1204 .set_debugreg = xen_set_debugreg,
979 .get_debugreg = xen_get_debugreg, 1205 .get_debugreg = xen_get_debugreg,
980 1206
981 .clts = native_clts, 1207 .clts = xen_clts,
982 1208
983 .read_cr0 = native_read_cr0, 1209 .read_cr0 = native_read_cr0,
984 .write_cr0 = native_write_cr0, 1210 .write_cr0 = xen_write_cr0,
985 1211
986 .read_cr4 = native_read_cr4, 1212 .read_cr4 = native_read_cr4,
987 .read_cr4_safe = native_read_cr4_safe, 1213 .read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1216,28 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 .wbinvd = native_wbinvd, 1216 .wbinvd = native_wbinvd,
991 1217
992 .read_msr = native_read_msr_safe, 1218 .read_msr = native_read_msr_safe,
993 .write_msr = native_write_msr_safe, 1219 .write_msr = xen_write_msr_safe,
994 .read_tsc = native_read_tsc, 1220 .read_tsc = native_read_tsc,
995 .read_pmc = native_read_pmc, 1221 .read_pmc = native_read_pmc,
996 1222
997 .iret = xen_iret, 1223 .iret = xen_iret,
998 .irq_enable_syscall_ret = xen_sysexit, 1224 .irq_enable_sysexit = xen_sysexit,
1225#ifdef CONFIG_X86_64
1226 .usergs_sysret32 = xen_sysret32,
1227 .usergs_sysret64 = xen_sysret64,
1228#endif
999 1229
1000 .load_tr_desc = paravirt_nop, 1230 .load_tr_desc = paravirt_nop,
1001 .set_ldt = xen_set_ldt, 1231 .set_ldt = xen_set_ldt,
1002 .load_gdt = xen_load_gdt, 1232 .load_gdt = xen_load_gdt,
1003 .load_idt = xen_load_idt, 1233 .load_idt = xen_load_idt,
1004 .load_tls = xen_load_tls, 1234 .load_tls = xen_load_tls,
1235#ifdef CONFIG_X86_64
1236 .load_gs_index = xen_load_gs_index,
1237#endif
1238
1239 .alloc_ldt = xen_alloc_ldt,
1240 .free_ldt = xen_free_ldt,
1005 1241
1006 .store_gdt = native_store_gdt, 1242 .store_gdt = native_store_gdt,
1007 .store_idt = native_store_idt, 1243 .store_idt = native_store_idt,
@@ -1015,27 +1251,17 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1015 .set_iopl_mask = xen_set_iopl_mask, 1251 .set_iopl_mask = xen_set_iopl_mask,
1016 .io_delay = xen_io_delay, 1252 .io_delay = xen_io_delay,
1017 1253
1254 /* Xen takes care of %gs when switching to usermode for us */
1255 .swapgs = paravirt_nop,
1256
1018 .lazy_mode = { 1257 .lazy_mode = {
1019 .enter = paravirt_enter_lazy_cpu, 1258 .enter = paravirt_enter_lazy_cpu,
1020 .leave = xen_leave_lazy, 1259 .leave = xen_leave_lazy,
1021 }, 1260 },
1022}; 1261};
1023 1262
1024static const struct pv_irq_ops xen_irq_ops __initdata = {
1025 .init_IRQ = xen_init_IRQ,
1026 .save_fl = xen_save_fl,
1027 .restore_fl = xen_restore_fl,
1028 .irq_disable = xen_irq_disable,
1029 .irq_enable = xen_irq_enable,
1030 .safe_halt = xen_safe_halt,
1031 .halt = xen_halt,
1032};
1033
1034static const struct pv_apic_ops xen_apic_ops __initdata = { 1263static const struct pv_apic_ops xen_apic_ops __initdata = {
1035#ifdef CONFIG_X86_LOCAL_APIC 1264#ifdef CONFIG_X86_LOCAL_APIC
1036 .apic_write = xen_apic_write,
1037 .apic_write_atomic = xen_apic_write,
1038 .apic_read = xen_apic_read,
1039 .setup_boot_clock = paravirt_nop, 1265 .setup_boot_clock = paravirt_nop,
1040 .setup_secondary_clock = paravirt_nop, 1266 .setup_secondary_clock = paravirt_nop,
1041 .startup_ipi_hook = paravirt_nop, 1267 .startup_ipi_hook = paravirt_nop,
@@ -1060,6 +1286,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1060 .pte_update = paravirt_nop, 1286 .pte_update = paravirt_nop,
1061 .pte_update_defer = paravirt_nop, 1287 .pte_update_defer = paravirt_nop,
1062 1288
1289 .pgd_alloc = xen_pgd_alloc,
1290 .pgd_free = xen_pgd_free,
1291
1063 .alloc_pte = xen_alloc_pte_init, 1292 .alloc_pte = xen_alloc_pte_init,
1064 .release_pte = xen_release_pte_init, 1293 .release_pte = xen_release_pte_init,
1065 .alloc_pmd = xen_alloc_pte_init, 1294 .alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1299,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1070 .kmap_atomic_pte = xen_kmap_atomic_pte, 1299 .kmap_atomic_pte = xen_kmap_atomic_pte,
1071#endif 1300#endif
1072 1301
1073 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1302#ifdef CONFIG_X86_64
1303 .set_pte = xen_set_pte,
1304#else
1305 .set_pte = xen_set_pte_init,
1306#endif
1074 .set_pte_at = xen_set_pte_at, 1307 .set_pte_at = xen_set_pte_at,
1075 .set_pmd = xen_set_pmd, 1308 .set_pmd = xen_set_pmd_hyper,
1309
1310 .ptep_modify_prot_start = __ptep_modify_prot_start,
1311 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1076 1312
1077 .pte_val = xen_pte_val, 1313 .pte_val = xen_pte_val,
1314 .pte_flags = native_pte_flags,
1078 .pgd_val = xen_pgd_val, 1315 .pgd_val = xen_pgd_val,
1079 1316
1080 .make_pte = xen_make_pte, 1317 .make_pte = xen_make_pte,
1081 .make_pgd = xen_make_pgd, 1318 .make_pgd = xen_make_pgd,
1082 1319
1320#ifdef CONFIG_X86_PAE
1083 .set_pte_atomic = xen_set_pte_atomic, 1321 .set_pte_atomic = xen_set_pte_atomic,
1084 .set_pte_present = xen_set_pte_at, 1322 .set_pte_present = xen_set_pte_at,
1085 .set_pud = xen_set_pud,
1086 .pte_clear = xen_pte_clear, 1323 .pte_clear = xen_pte_clear,
1087 .pmd_clear = xen_pmd_clear, 1324 .pmd_clear = xen_pmd_clear,
1325#endif /* CONFIG_X86_PAE */
1326 .set_pud = xen_set_pud_hyper,
1088 1327
1089 .make_pmd = xen_make_pmd, 1328 .make_pmd = xen_make_pmd,
1090 .pmd_val = xen_pmd_val, 1329 .pmd_val = xen_pmd_val,
1091 1330
1331#if PAGETABLE_LEVELS == 4
1332 .pud_val = xen_pud_val,
1333 .make_pud = xen_make_pud,
1334 .set_pgd = xen_set_pgd_hyper,
1335
1336 .alloc_pud = xen_alloc_pte_init,
1337 .release_pud = xen_release_pte_init,
1338#endif /* PAGETABLE_LEVELS == 4 */
1339
1092 .activate_mm = xen_activate_mm, 1340 .activate_mm = xen_activate_mm,
1093 .dup_mmap = xen_dup_mmap, 1341 .dup_mmap = xen_dup_mmap,
1094 .exit_mmap = xen_exit_mmap, 1342 .exit_mmap = xen_exit_mmap,
@@ -1097,28 +1345,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1097 .enter = paravirt_enter_lazy_mmu, 1345 .enter = paravirt_enter_lazy_mmu,
1098 .leave = xen_leave_lazy, 1346 .leave = xen_leave_lazy,
1099 }, 1347 },
1100};
1101 1348
1102#ifdef CONFIG_SMP 1349 .set_fixmap = xen_set_fixmap,
1103static const struct smp_ops xen_smp_ops __initdata = {
1104 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1105 .smp_prepare_cpus = xen_smp_prepare_cpus,
1106 .cpu_up = xen_cpu_up,
1107 .smp_cpus_done = xen_smp_cpus_done,
1108
1109 .smp_send_stop = xen_smp_send_stop,
1110 .smp_send_reschedule = xen_smp_send_reschedule,
1111 .smp_call_function_mask = xen_smp_call_function_mask,
1112}; 1350};
1113#endif /* CONFIG_SMP */
1114 1351
1115static void xen_reboot(int reason) 1352static void xen_reboot(int reason)
1116{ 1353{
1354 struct sched_shutdown r = { .reason = reason };
1355
1117#ifdef CONFIG_SMP 1356#ifdef CONFIG_SMP
1118 smp_send_stop(); 1357 smp_send_stop();
1119#endif 1358#endif
1120 1359
1121 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) 1360 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1122 BUG(); 1361 BUG();
1123} 1362}
1124 1363
@@ -1154,15 +1393,219 @@ static const struct machine_ops __initdata xen_machine_ops = {
1154 1393
1155static void __init xen_reserve_top(void) 1394static void __init xen_reserve_top(void)
1156{ 1395{
1396#ifdef CONFIG_X86_32
1157 unsigned long top = HYPERVISOR_VIRT_START; 1397 unsigned long top = HYPERVISOR_VIRT_START;
1158 struct xen_platform_parameters pp; 1398 struct xen_platform_parameters pp;
1159 1399
1160 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1400 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1161 top = pp.virt_start; 1401 top = pp.virt_start;
1162 1402
1163 reserve_top_address(-top + 2 * PAGE_SIZE); 1403 reserve_top_address(-top);
1404#endif /* CONFIG_X86_32 */
1405}
1406
1407/*
1408 * Like __va(), but returns address in the kernel mapping (which is
1409 * all we have until the physical memory mapping has been set up).
1410 */
1411static void *__ka(phys_addr_t paddr)
1412{
1413#ifdef CONFIG_X86_64
1414 return (void *)(paddr + __START_KERNEL_map);
1415#else
1416 return __va(paddr);
1417#endif
1418}
1419
1420/* Convert a machine address to physical address */
1421static unsigned long m2p(phys_addr_t maddr)
1422{
1423 phys_addr_t paddr;
1424
1425 maddr &= PTE_PFN_MASK;
1426 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1427
1428 return paddr;
1164} 1429}
1165 1430
1431/* Convert a machine address to kernel virtual */
1432static void *m2v(phys_addr_t maddr)
1433{
1434 return __ka(m2p(maddr));
1435}
1436
1437static void set_page_prot(void *addr, pgprot_t prot)
1438{
1439 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1440 pte_t pte = pfn_pte(pfn, prot);
1441
1442 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1443 BUG();
1444}
1445
1446static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1447{
1448 unsigned pmdidx, pteidx;
1449 unsigned ident_pte;
1450 unsigned long pfn;
1451
1452 ident_pte = 0;
1453 pfn = 0;
1454 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1455 pte_t *pte_page;
1456
1457 /* Reuse or allocate a page of ptes */
1458 if (pmd_present(pmd[pmdidx]))
1459 pte_page = m2v(pmd[pmdidx].pmd);
1460 else {
1461 /* Check for free pte pages */
1462 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1463 break;
1464
1465 pte_page = &level1_ident_pgt[ident_pte];
1466 ident_pte += PTRS_PER_PTE;
1467
1468 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1469 }
1470
1471 /* Install mappings */
1472 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1473 pte_t pte;
1474
1475 if (pfn > max_pfn_mapped)
1476 max_pfn_mapped = pfn;
1477
1478 if (!pte_none(pte_page[pteidx]))
1479 continue;
1480
1481 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1482 pte_page[pteidx] = pte;
1483 }
1484 }
1485
1486 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1487 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1488
1489 set_page_prot(pmd, PAGE_KERNEL_RO);
1490}
1491
1492#ifdef CONFIG_X86_64
1493static void convert_pfn_mfn(void *v)
1494{
1495 pte_t *pte = v;
1496 int i;
1497
1498 /* All levels are converted the same way, so just treat them
1499 as ptes. */
1500 for(i = 0; i < PTRS_PER_PTE; i++)
1501 pte[i] = xen_make_pte(pte[i].pte);
1502}
1503
1504/*
1505 * Set up the initial kernel pagetable.
1506 *
1507 * We can construct this by grafting the Xen provided pagetable into
1508 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1509 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1510 * means that only the kernel has a physical mapping to start with -
1511 * but that's enough to get __va working. We need to fill in the rest
1512 * of the physical mapping once some sort of allocator has been set
1513 * up.
1514 */
1515static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1516{
1517 pud_t *l3;
1518 pmd_t *l2;
1519
1520 /* Zap identity mapping */
1521 init_level4_pgt[0] = __pgd(0);
1522
1523 /* Pre-constructed entries are in pfn, so convert to mfn */
1524 convert_pfn_mfn(init_level4_pgt);
1525 convert_pfn_mfn(level3_ident_pgt);
1526 convert_pfn_mfn(level3_kernel_pgt);
1527
1528 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1529 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1530
1531 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1532 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1533
1534 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1535 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1536 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1537
1538 /* Set up identity map */
1539 xen_map_identity_early(level2_ident_pgt, max_pfn);
1540
1541 /* Make pagetable pieces RO */
1542 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1543 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1544 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1545 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1546 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1547 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1548
1549 /* Pin down new L4 */
1550 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1551 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1552
1553 /* Unpin Xen-provided one */
1554 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1555
1556 /* Switch over */
1557 pgd = init_level4_pgt;
1558
1559 /*
1560 * At this stage there can be no user pgd, and no page
1561 * structure to attach it to, so make sure we just set kernel
1562 * pgd.
1563 */
1564 xen_mc_batch();
1565 __xen_write_cr3(true, __pa(pgd));
1566 xen_mc_issue(PARAVIRT_LAZY_CPU);
1567
1568 reserve_early(__pa(xen_start_info->pt_base),
1569 __pa(xen_start_info->pt_base +
1570 xen_start_info->nr_pt_frames * PAGE_SIZE),
1571 "XEN PAGETABLES");
1572
1573 return pgd;
1574}
1575#else /* !CONFIG_X86_64 */
1576static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1577
1578static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1579{
1580 pmd_t *kernel_pmd;
1581
1582 init_pg_tables_start = __pa(pgd);
1583 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1584 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1585
1586 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1587 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1588
1589 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1590
1591 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1592 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1593 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1594
1595 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1596 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1597 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1598
1599 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1600
1601 xen_write_cr3(__pa(swapper_pg_dir));
1602
1603 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1604
1605 return swapper_pg_dir;
1606}
1607#endif /* CONFIG_X86_64 */
1608
1166/* First C function to be called on Xen boot */ 1609/* First C function to be called on Xen boot */
1167asmlinkage void __init xen_start_kernel(void) 1610asmlinkage void __init xen_start_kernel(void)
1168{ 1611{
@@ -1171,70 +1614,99 @@ asmlinkage void __init xen_start_kernel(void)
1171 if (!xen_start_info) 1614 if (!xen_start_info)
1172 return; 1615 return;
1173 1616
1617 xen_domain_type = XEN_PV_DOMAIN;
1618
1174 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1619 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1175 1620
1621 xen_setup_features();
1622
1176 /* Install Xen paravirt ops */ 1623 /* Install Xen paravirt ops */
1177 pv_info = xen_info; 1624 pv_info = xen_info;
1178 pv_init_ops = xen_init_ops; 1625 pv_init_ops = xen_init_ops;
1179 pv_time_ops = xen_time_ops; 1626 pv_time_ops = xen_time_ops;
1180 pv_cpu_ops = xen_cpu_ops; 1627 pv_cpu_ops = xen_cpu_ops;
1181 pv_irq_ops = xen_irq_ops;
1182 pv_apic_ops = xen_apic_ops; 1628 pv_apic_ops = xen_apic_ops;
1183 pv_mmu_ops = xen_mmu_ops; 1629 pv_mmu_ops = xen_mmu_ops;
1184 1630
1631 xen_init_irq_ops();
1632
1633#ifdef CONFIG_X86_LOCAL_APIC
1634 /*
1635 * set up the basic apic ops.
1636 */
1637 apic_ops = &xen_basic_apic_ops;
1638#endif
1639
1640 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1641 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1642 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1643 }
1644
1185 machine_ops = xen_machine_ops; 1645 machine_ops = xen_machine_ops;
1186 1646
1187#ifdef CONFIG_SMP 1647#ifdef CONFIG_X86_64
1188 smp_ops = xen_smp_ops; 1648 /* Disable until direct per-cpu data access. */
1649 have_vcpu_info_placement = 0;
1650 x86_64_init_pda();
1189#endif 1651#endif
1190 1652
1191 xen_setup_features(); 1653 xen_smp_init();
1192 1654
1193 /* Get mfn list */ 1655 /* Get mfn list */
1194 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1656 if (!xen_feature(XENFEAT_auto_translated_physmap))
1195 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; 1657 xen_build_dynamic_phys_to_machine();
1196 1658
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1659 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1660
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1661 /* Prevent unwanted bits from being set in PTEs. */
1200 1662 __supported_pte_mask &= ~_PAGE_GLOBAL;
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1663 if (!xen_initial_domain())
1202 1664 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1203 /* keep using Xen gdt for now; no urgent need to change it */
1204
1205 x86_write_percpu(xen_cr3, __pa(pgd));
1206 x86_write_percpu(xen_current_cr3, __pa(pgd));
1207 1665
1208 /* Don't do the full vcpu_info placement stuff until we have a 1666 /* Don't do the full vcpu_info placement stuff until we have a
1209 possible map and a non-dummy shared_info. */ 1667 possible map and a non-dummy shared_info. */
1210 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1668 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1211 1669
1670 xen_raw_console_write("mapping kernel into physical memory\n");
1671 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1672
1673 init_mm.pgd = pgd;
1674
1675 /* keep using Xen gdt for now; no urgent need to change it */
1676
1212 pv_info.kernel_rpl = 1; 1677 pv_info.kernel_rpl = 1;
1213 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1678 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1214 pv_info.kernel_rpl = 0; 1679 pv_info.kernel_rpl = 0;
1215 1680
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1221 /* set the limit of our address space */ 1681 /* set the limit of our address space */
1222 xen_reserve_top(); 1682 xen_reserve_top();
1223 1683
1684#ifdef CONFIG_X86_32
1224 /* set up basic CPUID stuff */ 1685 /* set up basic CPUID stuff */
1225 cpu_detect(&new_cpu_data); 1686 cpu_detect(&new_cpu_data);
1226 new_cpu_data.hard_math = 1; 1687 new_cpu_data.hard_math = 1;
1227 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1688 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1689#endif
1228 1690
1229 /* Poke various useful things into boot_params */ 1691 /* Poke various useful things into boot_params */
1230 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1692 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1231 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1693 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1232 ? __pa(xen_start_info->mod_start) : 0; 1694 ? __pa(xen_start_info->mod_start) : 0;
1233 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1695 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1696 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1234 1697
1235 if (!is_initial_xendomain()) 1698 if (!xen_initial_domain()) {
1699 add_preferred_console("xenboot", 0, NULL);
1700 add_preferred_console("tty", 0, NULL);
1236 add_preferred_console("hvc", 0, NULL); 1701 add_preferred_console("hvc", 0, NULL);
1702 }
1703
1704 xen_raw_console_write("about to get started...\n");
1237 1705
1238 /* Start the world */ 1706 /* Start the world */
1239 start_kernel(); 1707#ifdef CONFIG_X86_32
1708 i386_start_kernel();
1709#else
1710 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1711#endif
1240} 1712}
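The type_of_loader byte poked into boot_params above packs two fields: the x86 boot protocol keeps the loader identifier in the high nibble (9 is the value used for Xen here) and a loader-specific version in the low nibble, hence the (9 << 4) | 0. A tiny standalone sketch of that packing, with a made-up helper name, purely for illustration:

#include <stdio.h>
#include <stdint.h>

/* Illustrative helper: loader ID in the high nibble, version in the low. */
static uint8_t pack_type_of_loader(uint8_t loader_id, uint8_t version)
{
        return (uint8_t)((loader_id << 4) | (version & 0x0f));
}

int main(void)
{
        uint8_t t = pack_type_of_loader(9, 0);  /* as in xen_start_kernel() */

        printf("type_of_loader = 0x%02x (loader %u, version %u)\n",
               t, t >> 4, t & 0x0f);
        return 0;
}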
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..28b85ab8422e
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,143 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24#ifdef CONFIG_X86_64
25 int i;
26
27 /* Create identity vector->irq map */
28 for(i = 0; i < NR_VECTORS; i++) {
29 int cpu;
30
31 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i;
33 }
34#endif /* CONFIG_X86_64 */
35
36 xen_init_IRQ();
37}
38
39static unsigned long xen_save_fl(void)
40{
41 struct vcpu_info *vcpu;
42 unsigned long flags;
43
44 vcpu = x86_read_percpu(xen_vcpu);
45
46 /* flag has opposite sense of mask */
47 flags = !vcpu->evtchn_upcall_mask;
48
49 /* convert to IF type flag
50 -0 -> 0x00000000
51 -1 -> 0xffffffff
52 */
53 return (-flags) & X86_EFLAGS_IF;
54}
55
56static void xen_restore_fl(unsigned long flags)
57{
58 struct vcpu_info *vcpu;
59
60 /* convert from IF type flag */
61 flags = !(flags & X86_EFLAGS_IF);
62
63 /* There's a one instruction preempt window here. We need to
64 make sure we don't switch CPUs between getting the vcpu
65 pointer and updating the mask. */
66 preempt_disable();
67 vcpu = x86_read_percpu(xen_vcpu);
68 vcpu->evtchn_upcall_mask = flags;
69 preempt_enable_no_resched();
70
71 /* Doesn't matter if we get preempted here, because any
72 pending event will get dealt with anyway. */
73
74 if (flags == 0) {
75 preempt_check_resched();
76 barrier(); /* unmask then check (avoid races) */
77 if (unlikely(vcpu->evtchn_upcall_pending))
78 xen_force_evtchn_callback();
79 }
80}
81
82static void xen_irq_disable(void)
83{
84 /* There's a one instruction preempt window here. We need to
85 make sure we don't switch CPUs between getting the vcpu
86 pointer and updating the mask. */
87 preempt_disable();
88 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
89 preempt_enable_no_resched();
90}
91
92static void xen_irq_enable(void)
93{
94 struct vcpu_info *vcpu;
95
96 /* We don't need to worry about being preempted here, since
97 either a) interrupts are disabled, so no preemption, or b)
98 the caller is confused and is trying to re-enable interrupts
99 on an indeterminate processor. */
100
101 vcpu = x86_read_percpu(xen_vcpu);
102 vcpu->evtchn_upcall_mask = 0;
103
104 /* Doesn't matter if we get preempted here, because any
105 pending event will get dealt with anyway. */
106
107 barrier(); /* unmask then check (avoid races) */
108 if (unlikely(vcpu->evtchn_upcall_pending))
109 xen_force_evtchn_callback();
110}
111
112static void xen_safe_halt(void)
113{
114 /* Blocking includes an implicit local_irq_enable(). */
115 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
116 BUG();
117}
118
119static void xen_halt(void)
120{
121 if (irqs_disabled())
122 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
123 else
124 xen_safe_halt();
125}
126
127static const struct pv_irq_ops xen_irq_ops __initdata = {
128 .init_IRQ = __xen_init_IRQ,
129 .save_fl = xen_save_fl,
130 .restore_fl = xen_restore_fl,
131 .irq_disable = xen_irq_disable,
132 .irq_enable = xen_irq_enable,
133 .safe_halt = xen_safe_halt,
134 .halt = xen_halt,
135#ifdef CONFIG_X86_64
136 .adjust_exception_frame = xen_adjust_exception_frame,
137#endif
138};
139
140void __init xen_init_irq_ops()
141{
142 pv_irq_ops = xen_irq_ops;
143}
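xen_save_fl() and xen_restore_fl() above translate between the per-vcpu evtchn_upcall_mask (1 means event delivery is masked) and the x86 EFLAGS.IF convention (bit set means interrupts enabled); the trick is that (-flags) & X86_EFLAGS_IF turns the 0/1 flag into either 0 or the IF bit. A standalone sketch of just that conversion; only the architectural value of X86_EFLAGS_IF (0x200) is taken from x86, everything else is illustrative:

#include <stdio.h>

#define X86_EFLAGS_IF 0x00000200UL      /* architectural IF bit */

/* mask -> flags: mask == 0 (events allowed) maps to IF set */
static unsigned long save_fl(unsigned char upcall_mask)
{
        unsigned long flags = !upcall_mask;     /* opposite sense of mask */

        /* 0 -> 0x00000000, 1 -> 0xffff...ff, then keep only IF */
        return (-flags) & X86_EFLAGS_IF;
}

/* flags -> mask: IF clear means events must be masked */
static unsigned char restore_fl(unsigned long flags)
{
        return !(flags & X86_EFLAGS_IF);
}

int main(void)
{
        printf("mask=0 -> flags=%#lx\n", save_fl(0));           /* 0x200 */
        printf("mask=1 -> flags=%#lx\n", save_fl(1));           /* 0 */
        printf("flags=0x200 -> mask=%u\n", (unsigned)restore_fl(0x200));
        printf("flags=0     -> mask=%u\n", (unsigned)restore_fl(0));
        return 0;
}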
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index df40bf74ea75..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,12 +40,15 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 50#include <asm/paravirt.h>
51#include <asm/linkage.h>
49 52
50#include <asm/xen/hypercall.h> 53#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 54#include <asm/xen/hypervisor.h>
@@ -55,16 +58,200 @@
55 58
56#include "multicalls.h" 59#include "multicalls.h"
57#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
58 62
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
116
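The zero_stats/check_zero() pair above gives the MMU counters a lazy reset: writing 1 to the debugfs zero_stats file only raises a flag, and the next ADD_STATS() invocation clears every counter before accumulating. A minimal standalone sketch of the pattern with a two-field stats struct; the names echo the kernel's but nothing here is the real implementation:

#include <stdio.h>
#include <string.h>

static struct {
        unsigned pte_update;
        unsigned pmd_update;
} stats;

static unsigned char zero_stats;        /* set via a debugfs write in the kernel */

static void check_zero(void)
{
        if (zero_stats) {
                memset(&stats, 0, sizeof(stats));
                zero_stats = 0;
        }
}

#define ADD_STATS(field, val) \
        do { check_zero(); stats.field += (val); } while (0)

int main(void)
{
        ADD_STATS(pte_update, 3);
        zero_stats = 1;                 /* as if "1" were written to zero_stats */
        ADD_STATS(pmd_update, 1);       /* reset happens here, then count */

        printf("pte_update=%u pmd_update=%u\n",
               stats.pte_update, stats.pmd_update);
        return 0;
}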
117/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary.
120 */
121#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
122
123
124#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
125#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
126
127/* Placeholder for holes in the address space */
128static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
129 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
130
131 /* Array of pointers to pages containing p2m entries */
132static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
133 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
134
135/* Arrays of p2m arrays expressed in mfns used for save/restore */
136static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
137
138static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
139 __page_aligned_bss;
140
141static inline unsigned p2m_top_index(unsigned long pfn)
142{
143 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
144 return pfn / P2M_ENTRIES_PER_PAGE;
145}
146
147static inline unsigned p2m_index(unsigned long pfn)
148{
149 return pfn % P2M_ENTRIES_PER_PAGE;
150}
151
152/* Build the parallel p2m_top_mfn structures */
153void xen_setup_mfn_list_list(void)
60{ 154{
155 unsigned pfn, idx;
156
157 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn);
159
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
161 }
162
163 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
166 }
167
168 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
169
170 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
171 virt_to_mfn(p2m_top_mfn_list);
172 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
173}
174
175/* Set up p2m_top to point to the domain-builder provided p2m pages */
176void __init xen_build_dynamic_phys_to_machine(void)
177{
178 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
180 unsigned pfn;
181
182 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn);
184
185 p2m_top[topidx] = &mfn_list[pfn];
186 }
187}
188
189unsigned long get_phys_to_machine(unsigned long pfn)
190{
191 unsigned topidx, idx;
192
193 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
194 return INVALID_P2M_ENTRY;
195
196 topidx = p2m_top_index(pfn);
197 idx = p2m_index(pfn);
198 return p2m_top[topidx][idx];
199}
200EXPORT_SYMBOL_GPL(get_phys_to_machine);
201
202static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
203{
204 unsigned long *p;
205 unsigned i;
206
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
208 BUG_ON(p == NULL);
209
210 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY;
212
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
214 free_page((unsigned long)p);
215 else
216 *mfnp = virt_to_mfn(p);
217}
218
219void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
220{
221 unsigned topidx, idx;
222
223 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
224 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
225 return;
226 }
227
228 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
229 BUG_ON(mfn != INVALID_P2M_ENTRY);
230 return;
231 }
232
233 topidx = p2m_top_index(pfn);
234 if (p2m_top[topidx] == p2m_missing) {
235 /* no need to allocate a page to store an invalid entry */
236 if (mfn == INVALID_P2M_ENTRY)
237 return;
238 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
239 }
240
241 idx = p2m_index(pfn);
242 p2m_top[topidx][idx] = mfn;
243}
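The p2m code above is a two-level lookup: p2m_top[] holds one leaf page per P2M_ENTRIES_PER_PAGE frames, and a single shared p2m_missing page stands in for every hole, so get_phys_to_machine() reduces to a divide and a modulo. A userspace sketch of the same scheme with a deliberately tiny leaf of four entries so the split is easy to follow; the sizes and the mfn values are invented:

#include <stdio.h>

#define ENTRIES_PER_LEAF 4      /* toy; the kernel uses PAGE_SIZE / sizeof(unsigned long) */
#define MAX_PFNS         16
#define TOP_ENTRIES      (MAX_PFNS / ENTRIES_PER_LEAF)
#define INVALID_ENTRY    (~0UL)

/* Shared placeholder leaf, like p2m_missing */
static unsigned long missing[ENTRIES_PER_LEAF] = {
        [0 ... ENTRIES_PER_LEAF - 1] = INVALID_ENTRY
};

/* Top level, like p2m_top: every slot starts out pointing at the hole page */
static unsigned long *top[TOP_ENTRIES] = {
        [0 ... TOP_ENTRIES - 1] = missing
};

static unsigned top_index(unsigned long pfn)  { return pfn / ENTRIES_PER_LEAF; }
static unsigned leaf_index(unsigned long pfn) { return pfn % ENTRIES_PER_LEAF; }

static unsigned long get_p2m(unsigned long pfn)
{
        return top[top_index(pfn)][leaf_index(pfn)];
}

int main(void)
{
        static unsigned long leaf1[ENTRIES_PER_LEAF] = { 100, 101, 102, 103 };

        top[1] = leaf1;                                 /* populate pfns 4..7 */

        printf("pfn 5  -> mfn %lu\n", get_p2m(5));      /* 101 */
        printf("pfn 12 -> %#lx (hole)\n", get_p2m(12)); /* INVALID_ENTRY */
        return 0;
}

The range initializers are the same GNU extension the mmu.c hunk itself uses for p2m_missing and p2m_top.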
244
245xmaddr_t arbitrary_virt_to_machine(void *vaddr)
246{
247 unsigned long address = (unsigned long)vaddr;
61 unsigned int level; 248 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 249 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & ~PAGE_MASK; 250 unsigned offset = address & ~PAGE_MASK;
64 251
65 BUG_ON(pte == NULL); 252 BUG_ON(pte == NULL);
66 253
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 254 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
68} 255}
69 256
70void make_lowmem_page_readonly(void *vaddr) 257void make_lowmem_page_readonly(void *vaddr)
@@ -98,59 +285,84 @@ void make_lowmem_page_readwrite(void *vaddr)
98} 285}
99 286
100 287
101void xen_set_pmd(pmd_t *ptr, pmd_t val) 288static bool xen_page_pinned(void *ptr)
289{
290 struct page *page = virt_to_page(ptr);
291
292 return PagePinned(page);
293}
294
295static void xen_extend_mmu_update(const struct mmu_update *update)
102{ 296{
103 struct multicall_space mcs; 297 struct multicall_space mcs;
104 struct mmu_update *u; 298 struct mmu_update *u;
105 299
106 preempt_disable(); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
301
302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
306 mcs.mc->args[1]++;
307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
314 mcs = __xen_mc_entry(sizeof(*u));
315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
317 }
107 318
108 mcs = xen_mc_entry(sizeof(*u));
109 u = mcs.args; 319 u = mcs.args;
110 u->ptr = virt_to_machine(ptr).maddr; 320 *u = *update;
111 u->val = pmd_val_ma(val); 321}
112 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 322
323void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
324{
325 struct mmu_update u;
326
327 preempt_disable();
328
329 xen_mc_batch();
330
331 /* ptr may be ioremapped for 64-bit pagetable setup */
332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
333 u.val = pmd_val_ma(val);
334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
113 337
114 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
115 339
116 preempt_enable(); 340 preempt_enable();
117} 341}
118 342
343void xen_set_pmd(pmd_t *ptr, pmd_t val)
344{
345 ADD_STATS(pmd_update, 1);
346
347 /* If page is not pinned, we can just update the entry
348 directly */
349 if (!xen_page_pinned(ptr)) {
350 *ptr = val;
351 return;
352 }
353
354 ADD_STATS(pmd_update_pinned, 1);
355
356 xen_set_pmd_hyper(ptr, val);
357}
358
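xen_set_pmd() and xen_set_pud() above now split into two paths: while a pagetable page is unpinned it is still ordinary writable memory, so the entry can simply be stored, but once the page is pinned it is read-only under Xen and every write has to go through an mmu_update hypercall, which the _hyper variants batch under lazy-MMU mode. A minimal sketch of that decision; the helpers are placeholders standing in for xen_page_pinned() and the batched hypercall, not the real primitives:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder: the kernel checks PagePinned() on the page's struct page. */
static bool page_is_pinned(const unsigned long *slot)
{
        (void)slot;
        return false;           /* pretend the table is still under construction */
}

/* Placeholder: the kernel queues a struct mmu_update in a multicall batch. */
static void hypervisor_mmu_update(unsigned long *slot, unsigned long val)
{
        printf("hypercall: update %p to %#lx\n", (void *)slot, val);
        *slot = val;
}

/* Mirrors the xen_set_pmd() split: direct store when unpinned, hypercall otherwise. */
static void set_entry(unsigned long *slot, unsigned long val)
{
        if (!page_is_pinned(slot)) {
                *slot = val;    /* fast path */
                return;
        }
        hypervisor_mmu_update(slot, val);
}

int main(void)
{
        unsigned long pmd_entry = 0;

        set_entry(&pmd_entry, 0x1000 | 0x67);
        printf("entry now %#lx\n", pmd_entry);
        return 0;
}

The win is that freshly built pagetables (fork, exec) are filled with plain stores and only pay hypercall cost once they are pinned and live.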
119/* 359/*
120 * Associate a virtual page frame with a given physical page frame 360 * Associate a virtual page frame with a given physical page frame
121 * and protection flags for that frame. 361 * and protection flags for that frame.
122 */ 362 */
123void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 363void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124{ 364{
125 pgd_t *pgd; 365 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
126 pud_t *pud;
127 pmd_t *pmd;
128 pte_t *pte;
129
130 pgd = swapper_pg_dir + pgd_index(vaddr);
131 if (pgd_none(*pgd)) {
132 BUG();
133 return;
134 }
135 pud = pud_offset(pgd, vaddr);
136 if (pud_none(*pud)) {
137 BUG();
138 return;
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 BUG();
143 return;
144 }
145 pte = pte_offset_kernel(pmd, vaddr);
146 /* <mfn,flags> stored as-is, to permit clearing entries */
147 xen_set_pte(pte, mfn_pte(mfn, flags));
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154} 366}
155 367
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 368void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -160,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
160 if (mm == &init_mm) 372 if (mm == &init_mm)
161 preempt_disable(); 373 preempt_disable();
162 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
163 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
165 struct multicall_space mcs; 382 struct multicall_space mcs;
166 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
167 384
168 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
169 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
170 goto out; 388 goto out;
171 } else 389 } else
@@ -179,13 +397,36 @@ out:
179 preempt_enable(); 397 preempt_enable();
180} 398}
181 399
400pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
401{
402 /* Just return the pte as-is. We preserve the bits on commit */
403 return *ptep;
404}
405
406void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
407 pte_t *ptep, pte_t pte)
408{
409 struct mmu_update u;
410
411 xen_mc_batch();
412
413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
414 u.val = pte_val_ma(pte);
415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
419
420 xen_mc_issue(PARAVIRT_LAZY_MMU);
421}
422
182/* Assume pteval_t is equivalent to all the other *val_t types. */ 423/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val) 424static pteval_t pte_mfn_to_pfn(pteval_t val)
184{ 425{
185 if (val & _PAGE_PRESENT) { 426 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 427 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK; 428 pteval_t flags = val & PTE_FLAGS_MASK;
188 val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 429 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 } 430 }
190 431
191 return val; 432 return val;
@@ -194,9 +435,9 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
194static pteval_t pte_pfn_to_mfn(pteval_t val) 435static pteval_t pte_pfn_to_mfn(pteval_t val)
195{ 436{
196 if (val & _PAGE_PRESENT) { 437 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 438 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK; 439 pteval_t flags = val & PTE_FLAGS_MASK;
199 val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 440 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 } 441 }
201 442
202 return val; 443 return val;
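pte_mfn_to_pfn() and pte_pfn_to_mfn() above now mask with PTE_PFN_MASK/PTE_FLAGS_MASK and cast the frame number to pteval_t before shifting; with PAE the PTE is 64 bits wide while the frame number is a 32-bit unsigned long, so shifting before widening would truncate any frame above the 4 GiB boundary. A small standalone illustration of what the cast buys; the frame number and flag bits are invented, and on a 64-bit build both results match (compile with -m32 to see the truncation):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
typedef uint64_t pteval_t;              /* 64-bit PTEs, as with PAE */

int main(void)
{
        unsigned long mfn = 0x180000;   /* a frame above the 4 GiB mark */
        pteval_t flags = 0x67;          /* illustrative low flag bits */

        /* With a 32-bit unsigned long the shift overflows before widening. */
        pteval_t bad  = (mfn << PAGE_SHIFT) | flags;
        /* Widen first, as the patched pte_pfn_to_mfn() does. */
        pteval_t good = ((pteval_t)mfn << PAGE_SHIFT) | flags;

        printf("without cast: %#llx\n", (unsigned long long)bad);
        printf("with cast:    %#llx\n", (unsigned long long)good);
        return 0;
}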
@@ -229,34 +470,61 @@ pmdval_t xen_pmd_val(pmd_t pmd)
229 return pte_mfn_to_pfn(pmd.pmd); 470 return pte_mfn_to_pfn(pmd.pmd);
230} 471}
231 472
232void xen_set_pud(pud_t *ptr, pud_t val) 473void xen_set_pud_hyper(pud_t *ptr, pud_t val)
233{ 474{
234 struct multicall_space mcs; 475 struct mmu_update u;
235 struct mmu_update *u;
236 476
237 preempt_disable(); 477 preempt_disable();
238 478
239 mcs = xen_mc_entry(sizeof(*u)); 479 xen_mc_batch();
240 u = mcs.args; 480
241 u->ptr = virt_to_machine(ptr).maddr; 481 /* ptr may be ioremapped for 64-bit pagetable setup */
242 u->val = pud_val_ma(val); 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
243 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 483 u.val = pud_val_ma(val);
484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
244 487
245 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
246 489
247 preempt_enable(); 490 preempt_enable();
248} 491}
249 492
493void xen_set_pud(pud_t *ptr, pud_t val)
494{
495 ADD_STATS(pud_update, 1);
496
497 /* If page is not pinned, we can just update the entry
498 directly */
499 if (!xen_page_pinned(ptr)) {
500 *ptr = val;
501 return;
502 }
503
504 ADD_STATS(pud_update_pinned, 1);
505
506 xen_set_pud_hyper(ptr, val);
507}
508
250void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
251{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
515#ifdef CONFIG_X86_PAE
252 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
253 smp_wmb(); 517 smp_wmb();
254 ptep->pte_low = pte.pte_low; 518 ptep->pte_low = pte.pte_low;
519#else
520 *ptep = pte;
521#endif
255} 522}
256 523
524#ifdef CONFIG_X86_PAE
257void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 525void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
258{ 526{
259 set_64bit((u64 *)ptep, pte_val_ma(pte)); 527 set_64bit((u64 *)ptep, native_pte_val(pte));
260} 528}
261 529
262void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 530void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -268,8 +536,9 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
268 536
269void xen_pmd_clear(pmd_t *pmdp) 537void xen_pmd_clear(pmd_t *pmdp)
270{ 538{
271 xen_set_pmd(pmdp, __pmd(0)); 539 set_pmd(pmdp, __pmd(0));
272} 540}
541#endif /* CONFIG_X86_PAE */
273 542
274pmd_t xen_make_pmd(pmdval_t pmd) 543pmd_t xen_make_pmd(pmdval_t pmd)
275{ 544{
@@ -277,95 +546,218 @@ pmd_t xen_make_pmd(pmdval_t pmd)
277 return native_make_pmd(pmd); 546 return native_make_pmd(pmd);
278} 547}
279 548
549#if PAGETABLE_LEVELS == 4
550pudval_t xen_pud_val(pud_t pud)
551{
552 return pte_mfn_to_pfn(pud.pud);
553}
554
555pud_t xen_make_pud(pudval_t pud)
556{
557 pud = pte_pfn_to_mfn(pud);
558
559 return native_make_pud(pud);
560}
561
562pgd_t *xen_get_user_pgd(pgd_t *pgd)
563{
564 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
565 unsigned offset = pgd - pgd_page;
566 pgd_t *user_ptr = NULL;
567
568 if (offset < pgd_index(USER_LIMIT)) {
569 struct page *page = virt_to_page(pgd_page);
570 user_ptr = (pgd_t *)page->private;
571 if (user_ptr)
572 user_ptr += offset;
573 }
574
575 return user_ptr;
576}
577
578static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
579{
580 struct mmu_update u;
581
582 u.ptr = virt_to_machine(ptr).maddr;
583 u.val = pgd_val_ma(val);
584 xen_extend_mmu_update(&u);
585}
586
587/*
588 * Raw hypercall-based set_pgd, intended for use in early boot before
589 * there's a page structure. This implies:
590 * 1. The only existing pagetable is the kernel's
591 * 2. It is always pinned
592 * 3. It has no user pagetable attached to it
593 */
594void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
595{
596 preempt_disable();
597
598 xen_mc_batch();
599
600 __xen_set_pgd_hyper(ptr, val);
601
602 xen_mc_issue(PARAVIRT_LAZY_MMU);
603
604 preempt_enable();
605}
606
607void xen_set_pgd(pgd_t *ptr, pgd_t val)
608{
609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
610
611 ADD_STATS(pgd_update, 1);
612
613 /* If page is not pinned, we can just update the entry
614 directly */
615 if (!xen_page_pinned(ptr)) {
616 *ptr = val;
617 if (user_ptr) {
618 WARN_ON(xen_page_pinned(user_ptr));
619 *user_ptr = val;
620 }
621 return;
622 }
623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
627 /* If it's pinned, then we can at least batch the kernel and
628 user updates together. */
629 xen_mc_batch();
630
631 __xen_set_pgd_hyper(ptr, val);
632 if (user_ptr)
633 __xen_set_pgd_hyper(user_ptr, val);
634
635 xen_mc_issue(PARAVIRT_LAZY_MMU);
636}
637#endif /* PAGETABLE_LEVELS == 4 */
638
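On 64-bit each pinned pgd can have a second, user-only pgd page; xen_get_user_pgd() above finds the user-side slot that corresponds to a kernel-side one by masking the entry pointer down to its page, taking the entry offset, and applying that offset to the user page recorded in struct page::private. A toy sketch of just the pointer arithmetic; passing the shadow page in explicitly (instead of fetching it from struct page) is an assumption of the sketch:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

typedef struct { unsigned long val; } pgd_t;

/* Given one entry in a kernel pgd page and the matching user pgd page,
 * return the corresponding user entry. */
static pgd_t *user_entry(pgd_t *kernel_entry, pgd_t *user_page)
{
        pgd_t *kernel_page = (pgd_t *)((uintptr_t)kernel_entry & ~(PAGE_SIZE - 1));
        ptrdiff_t offset = kernel_entry - kernel_page;  /* entry index in the page */

        return user_page + offset;
}

int main(void)
{
        pgd_t *kernel_page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        pgd_t *user_page   = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        pgd_t *k, *u;

        if (!kernel_page || !user_page)
                return 1;

        k = &kernel_page[5];
        u = user_entry(k, user_page);

        printf("kernel entry index %td maps to user entry index %td\n",
               k - kernel_page, u - user_page);

        free(kernel_page);
        free(user_page);
        return 0;
}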
280/* 639/*
281 (Yet another) pagetable walker. This one is intended for pinning a 640 * (Yet another) pagetable walker. This one is intended for pinning a
282 pagetable. This means that it walks a pagetable and calls the 641 * pagetable. This means that it walks a pagetable and calls the
283 callback function on each page it finds making up the page table, 642 * callback function on each page it finds making up the page table,
284 at every level. It walks the entire pagetable, but it only bothers 643 * at every level. It walks the entire pagetable, but it only bothers
285 pinning pte pages which are below pte_limit. In the normal case 644 * pinning pte pages which are below limit. In the normal case this
286 this will be TASK_SIZE, but at boot we need to pin up to 645 * will be STACK_TOP_MAX, but at boot we need to pin up to
287 FIXADDR_TOP. But the important bit is that we don't pin beyond 646 * FIXADDR_TOP.
288 there, because then we start getting into Xen's ptes. 647 *
289*/ 648 * For 32-bit the important bit is that we don't pin beyond there,
290static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 649 * because then we start getting into Xen's ptes.
291 unsigned long limit) 650 *
292{ 651 * For 64-bit, we must skip the Xen hole in the middle of the address
293 pgd_t *pgd = pgd_base; 652 * space, just after the big x86-64 virtual hole.
653 */
654static int xen_pgd_walk(struct mm_struct *mm,
655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
658{
659 pgd_t *pgd = mm->pgd;
294 int flush = 0; 660 int flush = 0;
295 unsigned long addr = 0; 661 unsigned hole_low, hole_high;
296 unsigned long pgd_next; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
663 unsigned pgdidx, pudidx, pmdidx;
297 664
298 BUG_ON(limit > FIXADDR_TOP); 665 /* The limit is the last byte to be touched */
666 limit--;
667 BUG_ON(limit >= FIXADDR_TOP);
299 668
300 if (xen_feature(XENFEAT_auto_translated_physmap)) 669 if (xen_feature(XENFEAT_auto_translated_physmap))
301 return 0; 670 return 0;
302 671
303 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 672 /*
673 * 64-bit has a great big hole in the middle of the address
674 * space, which contains the Xen mappings. On 32-bit these
675 * will end up making a zero-sized hole and so is a no-op.
676 */
677 hole_low = pgd_index(USER_LIMIT);
678 hole_high = pgd_index(PAGE_OFFSET);
679
680 pgdidx_limit = pgd_index(limit);
681#if PTRS_PER_PUD > 1
682 pudidx_limit = pud_index(limit);
683#else
684 pudidx_limit = 0;
685#endif
686#if PTRS_PER_PMD > 1
687 pmdidx_limit = pmd_index(limit);
688#else
689 pmdidx_limit = 0;
690#endif
691
692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
304 pud_t *pud; 693 pud_t *pud;
305 unsigned long pud_limit, pud_next;
306 694
307 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 695 if (pgdidx >= hole_low && pgdidx < hole_high)
696 continue;
308 697
309 if (!pgd_val(*pgd)) 698 if (!pgd_val(pgd[pgdidx]))
310 continue; 699 continue;
311 700
312 pud = pud_offset(pgd, 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
313 702
314 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
315 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
316 705
317 for (; addr != pud_limit; pud++, addr = pud_next) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
318 pmd_t *pmd; 707 pmd_t *pmd;
319 unsigned long pmd_limit;
320
321 pud_next = pud_addr_end(addr, pud_limit);
322 708
323 if (pud_next < limit) 709 if (pgdidx == pgdidx_limit &&
324 pmd_limit = pud_next; 710 pudidx > pudidx_limit)
325 else 711 goto out;
326 pmd_limit = limit;
327 712
328 if (pud_none(*pud)) 713 if (pud_none(pud[pudidx]))
329 continue; 714 continue;
330 715
331 pmd = pmd_offset(pud, 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
332 717
333 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
334 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
720
721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
722 struct page *pte;
335 723
336 for (; addr != pmd_limit; pmd++) { 724 if (pgdidx == pgdidx_limit &&
337 addr += (PAGE_SIZE * PTRS_PER_PTE); 725 pudidx == pudidx_limit &&
338 if ((pmd_limit-1) < (addr-1)) { 726 pmdidx > pmdidx_limit)
339 addr = pmd_limit; 727 goto out;
340 break;
341 }
342 728
343 if (pmd_none(*pmd)) 729 if (pmd_none(pmd[pmdidx]))
344 continue; 730 continue;
345 731
346 flush |= (*func)(pmd_page(*pmd), PT_PTE); 732 pte = pmd_page(pmd[pmdidx]);
733 flush |= (*func)(mm, pte, PT_PTE);
347 } 734 }
348 } 735 }
349 } 736 }
350 737
351 flush |= (*func)(virt_to_page(pgd_base), PT_PGD); 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
352 742
353 return flush; 743 return flush;
354} 744}
355 745
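xen_pgd_walk() above iterates by pgd/pud/pmd index instead of by address, and skips the pgd slots between the top of user space and the start of the kernel mappings, which on 64-bit is where the Xen hole lives (on 32-bit the two indices coincide and nothing is skipped). A toy sketch of the index arithmetic and the hole check; the shift, table size and the two boundary addresses are made-up stand-ins for PGDIR_SHIFT, PTRS_PER_PGD, USER_LIMIT and PAGE_OFFSET:

#include <stdio.h>

#define PGDIR_SHIFT  39                 /* illustrative 4-level values */
#define PTRS_PER_PGD 512

static unsigned pgd_index(unsigned long long addr)
{
        return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
        /* Hypothetical bounds standing in for USER_LIMIT and PAGE_OFFSET. */
        unsigned long long user_limit  = 0x00007fffffffffffULL;
        unsigned long long page_offset = 0xffff880000000000ULL;

        unsigned hole_low  = pgd_index(user_limit);
        unsigned hole_high = pgd_index(page_offset);
        unsigned idx, walked = 0;

        for (idx = 0; idx < PTRS_PER_PGD; idx++) {
                if (idx >= hole_low && idx < hole_high)
                        continue;       /* skip the hole holding Xen's mappings */
                walked++;
        }

        printf("hole is pgd indices [%u, %u); %u entries walked\n",
               hole_low, hole_high, walked);
        return 0;
}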
356static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
357{ 749{
358 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
359 751
360#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
361 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
362 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
363#endif 755#endif
364 756
365 return ptl; 757 return ptl;
366} 758}
367 759
368static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
369{ 761{
370 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
371 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -383,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
383 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
384} 776}
385 777
386static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
387{ 780{
388 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
389 int flush; 782 int flush;
@@ -402,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
402 795
403 flush = 0; 796 flush = 0;
404 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
405 ptl = NULL; 818 ptl = NULL;
406 if (level == PT_PTE) 819 if (level == PT_PTE)
407 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
408 821
409 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
410 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
411 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
412 825
413 if (level == PT_PTE) 826 if (ptl) {
414 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
415 828
416 if (ptl) {
417 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
418 is completed. */ 830 is completed. */
419 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
420 } 832 }
421 } 833 }
422 834
@@ -426,25 +838,78 @@ static int pin_page(struct page *page, enum pt_level level)
426/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
427 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
428 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
429void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
430{ 842{
431 xen_mc_batch(); 843 xen_mc_batch();
432 844
433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
434 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
435 xen_mc_issue(0); 847 xen_mc_issue(0);
436 kmap_flush_unused(); 848 kmap_flush_unused();
437 xen_mc_batch(); 849 xen_mc_batch();
438 } 850 }
439 851
852#ifdef CONFIG_X86_64
853 {
854 pgd_t *user_pgd = xen_get_user_pgd(pgd);
855
856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
857
858 if (user_pgd) {
859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
861 }
862 }
863#else /* CONFIG_X86_32 */
864#ifdef CONFIG_X86_PAE
865 /* Need to make sure unshared kernel PMD is pinnable */
866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
868#endif
440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
870#endif /* CONFIG_X86_64 */
441 xen_mc_issue(0); 871 xen_mc_issue(0);
442} 872}
443 873
444/* The init_mm pagetable is really pinned as soon as its created, but 874static void xen_pgd_pin(struct mm_struct *mm)
445 that's before we have page structures to store the bits. So do all 875{
446 the book-keeping now. */ 876 __xen_pgd_pin(mm, mm->pgd);
447static __init int mark_pinned(struct page *page, enum pt_level level) 877}
878
879/*
880 * On save, we need to pin all pagetables to make sure they get their
881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
882 * them (unpinned pgds are not currently in use, probably because the
883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
888 */
889void xen_mm_pin_all(void)
890{
891 unsigned long flags;
892 struct page *page;
893
894 spin_lock_irqsave(&pgd_lock, flags);
895
896 list_for_each_entry(page, &pgd_list, lru) {
897 if (!PagePinned(page)) {
898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
899 SetPageSavePinned(page);
900 }
901 }
902
903 spin_unlock_irqrestore(&pgd_lock, flags);
904}
905
906/*
907 * The init_mm pagetable is really pinned as soon as it's created, but
908 * that's before we have page structures to store the bits. So do all
909 * the book-keeping now.
910 */
911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
448{ 913{
449 SetPagePinned(page); 914 SetPagePinned(page);
450 return 0; 915 return 0;
@@ -452,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
452 917
453void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
454{ 919{
455 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
456} 921}
457 922
458static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
459{ 925{
460 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
461 927
@@ -465,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
465 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
466 struct multicall_space mcs; 932 struct multicall_space mcs;
467 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
468 if (level == PT_PTE) { 941 if (level == PT_PTE) {
469 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
470 943
471 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
472 } 946 }
473 947
474 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -479,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
479 953
480 if (ptl) { 954 if (ptl) {
481 /* unlock when batch completed */ 955 /* unlock when batch completed */
482 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
483 } 957 }
484 } 958 }
485 959
@@ -487,28 +961,72 @@ static int unpin_page(struct page *page, enum pt_level level)
487} 961}
488 962
489/* Release a pagetables pages back as normal RW */ 963/* Release a pagetables pages back as normal RW */
490static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
491{ 965{
492 xen_mc_batch(); 966 xen_mc_batch();
493 967
494 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 968 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
495 969
496 pgd_walk(pgd, unpin_page, TASK_SIZE); 970#ifdef CONFIG_X86_64
971 {
972 pgd_t *user_pgd = xen_get_user_pgd(pgd);
973
974 if (user_pgd) {
975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
977 }
978 }
979#endif
980
981#ifdef CONFIG_X86_PAE
982 /* Need to make sure unshared kernel PMD is unpinned */
983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
985#endif
986
987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
497 988
498 xen_mc_issue(0); 989 xen_mc_issue(0);
499} 990}
500 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
997/*
998 * On resume, undo any pinning done at save, so that the rest of the
999 * kernel doesn't see any unexpected pinned pagetables.
1000 */
1001void xen_mm_unpin_all(void)
1002{
1003 unsigned long flags;
1004 struct page *page;
1005
1006 spin_lock_irqsave(&pgd_lock, flags);
1007
1008 list_for_each_entry(page, &pgd_list, lru) {
1009 if (PageSavePinned(page)) {
1010 BUG_ON(!PagePinned(page));
1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1012 ClearPageSavePinned(page);
1013 }
1014 }
1015
1016 spin_unlock_irqrestore(&pgd_lock, flags);
1017}
1018
501void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
502{ 1020{
503 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
504 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
505 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
506} 1024}
507 1025
508void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
509{ 1027{
510 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
511 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
512 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
513} 1031}
514 1032
@@ -519,8 +1037,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
519static void drop_other_mm_ref(void *info) 1037static void drop_other_mm_ref(void *info)
520{ 1038{
521 struct mm_struct *mm = info; 1039 struct mm_struct *mm = info;
1040 struct mm_struct *active_mm;
1041
1042#ifdef CONFIG_X86_64
1043 active_mm = read_pda(active_mm);
1044#else
1045 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1046#endif
522 1047
523 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 1048 if (active_mm == mm)
524 leave_mm(smp_processor_id()); 1049 leave_mm(smp_processor_id());
525 1050
526 /* If this cpu still has a stale cr3 reference, then make sure 1051 /* If this cpu still has a stale cr3 reference, then make sure
@@ -531,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
531 } 1056 }
532} 1057}
533 1058
534static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
535{ 1060{
536 cpumask_t mask; 1061 cpumask_t mask;
537 unsigned cpu; 1062 unsigned cpu;
@@ -558,10 +1083,10 @@ static void drop_mm_ref(struct mm_struct *mm)
558 } 1083 }
559 1084
560 if (!cpus_empty(mask)) 1085 if (!cpus_empty(mask))
561 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
562} 1087}
563#else 1088#else
564static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
565{ 1090{
566 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
567 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -585,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
585void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
586{ 1111{
587 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
588 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
589 put_cpu(); 1114 put_cpu();
590 1115
591 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
592 1117
593 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
594 if (PagePinned(virt_to_page(mm->pgd))) 1119 if (xen_page_pinned(mm->pgd))
595 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
596 1121
597 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
598} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_batched);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_batched);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_batched);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_batched);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */
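
The mmu.c hunks above keep the existing idea behind drop_mm_ref() while renaming it xen_drop_mm_ref(): when a pagetable is torn down, collect into a mask only the CPUs that may still reference it (through active_mm or a lazily loaded cr3) and send the cross-CPU call to just those. The user-space sketch below models that filtering step only; NCPUS, the per-cpu arrays and notify() are illustrative stand-ins rather than kernel interfaces, and the local CPU (which the kernel handles separately by reloading swapper_pg_dir) is simply skipped.

#include <stdio.h>
#include <stdint.h>

#define NCPUS 4

struct mm { int id; };

/* toy per-cpu state: which mm is active, which pagetable is loaded */
static struct mm *active_mm[NCPUS];
static struct mm *loaded_cr3[NCPUS];

/* stand-in for the cross-CPU call: here we just log the target */
static void notify(int cpu, struct mm *mm)
{
	printf("IPI cpu%d: drop references to mm %d\n", cpu, mm->id);
}

/* mirror of the xen_drop_mm_ref() idea: build a mask of CPUs that
   may still reference @mm, then poke only those */
static void drop_mm_ref(struct mm *mm, int self)
{
	uint32_t mask = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu == self)
			continue;
		if (active_mm[cpu] == mm || loaded_cr3[cpu] == mm)
			mask |= 1u << cpu;
	}

	for (cpu = 0; cpu < NCPUS; cpu++)
		if (mask & (1u << cpu))
			notify(cpu, mm);
}

int main(void)
{
	struct mm a = { 1 }, b = { 2 };

	active_mm[0] = &a; loaded_cr3[0] = &a;
	active_mm[1] = &b; loaded_cr3[1] = &a;	/* stale cr3 reference */
	active_mm[2] = &b; loaded_cr3[2] = &b;
	active_mm[3] = &a; loaded_cr3[3] = &a;

	drop_mm_ref(&a, 0);	/* only cpus 1 and 3 should be notified */
	return 0;
}
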
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe961caffd4..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,33 +10,14 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
28void xen_set_pte(pte_t *ptep, pte_t pteval);
29void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
30 pte_t *ptep, pte_t pteval);
31void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
32 16
33void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); 17void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
34void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
35void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
36 20
37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd);
39
40pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
41pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
42pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
@@ -45,11 +26,32 @@ pte_t xen_make_pte(pteval_t);
45pmd_t xen_make_pmd(pmdval_t); 26pmd_t xen_make_pmd(pmdval_t);
46pgd_t xen_make_pgd(pgdval_t); 27pgd_t xen_make_pgd(pgdval_t);
47 28
29void xen_set_pte(pte_t *ptep, pte_t pteval);
48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 30void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
49 pte_t *ptep, pte_t pteval); 31 pte_t *ptep, pte_t pteval);
32
33#ifdef CONFIG_X86_PAE
50void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 34void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
51void xen_set_pud(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 35void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
53void xen_pmd_clear(pmd_t *pmdp); 36void xen_pmd_clear(pmd_t *pmdp);
37#endif /* CONFIG_X86_PAE */
38
39void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
40void xen_set_pud(pud_t *ptr, pud_t val);
41void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
42void xen_set_pud_hyper(pud_t *ptr, pud_t val);
43
44#if PAGETABLE_LEVELS == 4
45pudval_t xen_pud_val(pud_t pud);
46pud_t xen_make_pud(pudval_t pudval);
47void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
48void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
49#endif
50
51pgd_t *xen_get_user_pgd(pgd_t *pgd);
52
53pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte);
54 56
55#endif /* _XEN_MMU_H */ 57#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,22 +21,26 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32 35#define MC_ARGS (MC_BATCH * 16)
32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 36
33 37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH]; 41 struct multicall_entry debug[MC_BATCH];
38#endif 42#endif
39 u64 args[MC_ARGS]; 43 unsigned char args[MC_ARGS];
40 struct callback { 44 struct callback {
41 void (*fn)(void *); 45 void (*fn)(void *);
42 void *data; 46 void *data;
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -76,6 +152,7 @@ void xen_mc_flush(void)
76 if (ret) { 152 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 153 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 154 ret, smp_processor_id());
155 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 156 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 157 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 158 i+1, b->mcidx,
@@ -107,20 +184,49 @@ struct multicall_space __xen_mc_entry(size_t args)
107{ 184{
108 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 185 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
109 struct multicall_space ret; 186 struct multicall_space ret;
110 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); 187 unsigned argidx = roundup(b->argidx, sizeof(u64));
111 188
112 BUG_ON(preemptible()); 189 BUG_ON(preemptible());
113 BUG_ON(argspace > MC_ARGS); 190 BUG_ON(b->argidx > MC_ARGS);
114 191
115 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
116 (b->argidx + argspace) > MC_ARGS) 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
117 xen_mc_flush(); 195 xen_mc_flush();
196 argidx = roundup(b->argidx, sizeof(u64));
197 }
118 198
119 ret.mc = &b->entries[b->mcidx]; 199 ret.mc = &b->entries[b->mcidx];
120 b->mcidx++; 200 b->mcidx++;
201 ret.args = &b->args[argidx];
202 b->argidx = argidx + args;
203
204 BUG_ON(b->argidx > MC_ARGS);
205 return ret;
206}
207
208struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
209{
210 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
211 struct multicall_space ret = { NULL, NULL };
212
213 BUG_ON(preemptible());
214 BUG_ON(b->argidx > MC_ARGS);
215
216 if (b->mcidx == 0)
217 return ret;
218
219 if (b->entries[b->mcidx - 1].op != op)
220 return ret;
221
222 if ((b->argidx + size) > MC_ARGS)
223 return ret;
224
225 ret.mc = &b->entries[b->mcidx - 1];
121 ret.args = &b->args[b->argidx]; 226 ret.args = &b->args[b->argidx];
122 b->argidx += argspace; 227 b->argidx += size;
123 228
229 BUG_ON(b->argidx > MC_ARGS);
124 return ret; 230 return ret;
125} 231}
126 232
@@ -129,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
129 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
130 struct callback *cb; 236 struct callback *cb;
131 237
132 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
133 xen_mc_flush(); 240 xen_mc_flush();
241 }
134 242
135 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
136 cb->fn = fn; 244 cb->fn = fn;
137 cb->data = data; 245 cb->data = data;
138} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
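
The debugfs additions to multicalls.c follow the same pattern as the mmu.c ones: plain counters, a histogram indexed by batch size, per-reason flush counters, and a writable zero_stats byte that arms a reset applied the next time any statistic is updated. A rough stand-alone model of that bookkeeping is sketched below; the names mirror the patch, but the code is a toy rather than the kernel implementation.

#include <stdio.h>
#include <string.h>

#define MC_BATCH 32

enum flush_reasons { FL_SLOTS, FL_ARGS, FL_CALLBACKS, FL_N_REASONS };

/* toy mirror of mc_stats: counters plus a histogram over batch sizes */
static struct {
	unsigned histo[MC_BATCH + 1];
	unsigned issued;
	unsigned hypercalls;
	unsigned flush[FL_N_REASONS];
} stats;

static unsigned char zero_stats;

/* same trick as check_zero(): writing 1 to the debugfs file arms a
   reset that takes effect on the next statistics update */
static void check_zero(void)
{
	if (zero_stats) {
		memset(&stats, 0, sizeof(stats));
		zero_stats = 0;
	}
}

static void add_batch(unsigned ncalls)
{
	check_zero();
	stats.issued++;
	stats.hypercalls += ncalls;
	stats.histo[ncalls]++;
}

static void note_flush(enum flush_reasons why)
{
	check_zero();
	stats.flush[why]++;
}

int main(void)
{
	add_batch(3);
	add_batch(32);
	note_flush(FL_SLOTS);

	printf("batches=%u hypercalls=%u full_batches=%u slot_flushes=%u\n",
	       stats.issued, stats.hypercalls, stats.histo[32],
	       stats.flush[FL_SLOTS]);

	zero_stats = 1;		/* arm a reset */
	add_batch(1);		/* counters start again from zero */
	printf("after reset: batches=%u\n", stats.issued);
	return 0;
}
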
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
45/* Set up a callback to be called when the current batch is flushed */ 45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data); 46void xen_mc_callback(void (*fn)(void *), void *data);
47 47
48/*
49 * Try to extend the arguments of the previous multicall command. The
50 * previous command's op must match. If it does, then it attempts to
51 * extend the argument space allocated to the multicall entry by
52 * arg_size bytes.
53 *
54 * On success, the returned multicall_space has mc pointing to the
55 * command and args pointing to the newly allocated space; on
56 * failure, mc is NULL.
57 */
58struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
59
48#endif /* _XEN_MULTICALLS_H */ 60#endif /* _XEN_MULTICALLS_H */
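
Taken together, the multicalls.c and multicalls.h changes turn the argument area into a plain byte array: __xen_mc_entry() rounds the running offset up to an 8-byte boundary before carving out space (flushing the batch when the request would overflow), while xen_mc_extend_args() simply appends to the previous entry when its op matches and the extra bytes fit. The stand-alone sketch below models that allocator under those assumptions; MC_ARGS, mc_entry() and mc_extend() are local toys, not the kernel symbols.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define MC_ARGS 512		/* toy argument area, in bytes */

static unsigned char args[MC_ARGS];
static size_t argidx;		/* current fill level */

static size_t round_up(size_t x, size_t a)
{
	return (x + a - 1) & ~(a - 1);
}

/* new entry: start on an 8-byte boundary, like __xen_mc_entry() */
static void *mc_entry(size_t size)
{
	size_t start = round_up(argidx, sizeof(uint64_t));

	if (start + size > MC_ARGS)
		return NULL;	/* the kernel would flush the batch here */
	argidx = start + size;
	return &args[start];
}

/* extend the previous entry: no re-alignment, like xen_mc_extend_args() */
static void *mc_extend(size_t size)
{
	if (argidx + size > MC_ARGS)
		return NULL;	/* caller falls back to a fresh entry */
	void *p = &args[argidx];
	argidx += size;
	return p;
}

int main(void)
{
	unsigned char *a = mc_entry(12);	/* offset 0 */
	unsigned char *b = mc_extend(4);	/* appended at offset 12 */
	unsigned char *c = mc_entry(8);		/* re-aligned to offset 16 */

	printf("a=%zu b=%zu c=%zu argidx=%zu\n",
	       (size_t)(a - args), (size_t)(b - args),
	       (size_t)(c - args), argidx);
	return 0;
}
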
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -13,9 +13,11 @@
13#include <asm/vdso.h> 13#include <asm/vdso.h>
14#include <asm/e820.h> 14#include <asm/e820.h>
15#include <asm/setup.h> 15#include <asm/setup.h>
16#include <asm/acpi.h>
16#include <asm/xen/hypervisor.h> 17#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 18#include <asm/xen/hypercall.h>
18 19
20#include <xen/page.h>
19#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
20#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
21#include <xen/features.h> 23#include <xen/features.h>
@@ -27,8 +29,6 @@
27extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
28extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
29 31
30unsigned long *phys_to_machine_mapping;
31EXPORT_SYMBOL(phys_to_machine_mapping);
32 32
33/** 33/**
34 * machine_specific_memory_setup - Hook for machine specific memory setup. 34 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,9 +38,31 @@ char * __init xen_memory_setup(void)
38{ 38{
39 unsigned long max_pfn = xen_start_info->nr_pages; 39 unsigned long max_pfn = xen_start_info->nr_pages;
40 40
41 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
42
41 e820.nr_map = 0; 43 e820.nr_map = 0;
42 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 44
43 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46
47 /*
48 * Even though this is normal, usable memory under Xen, reserve
49 * ISA memory anyway because too many things think they can poke
50 * about in there.
51 */
52 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
53 E820_RESERVED);
54
55 /*
56 * Reserve Xen bits:
57 * - mfn_list
58 * - xen_start_info
59 * See comment above "struct start_info" in <xen/interface/xen.h>
60 */
61 e820_add_region(__pa(xen_start_info->mfn_list),
62 xen_start_info->pt_base - xen_start_info->mfn_list,
63 E820_RESERVED);
64
65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
44 66
45 return "Xen"; 67 return "Xen";
46} 68}
@@ -61,30 +83,72 @@ static void xen_idle(void)
61 83
62/* 84/*
63 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
64 */ 88 */
65static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
66{ 90{
67 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
68 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
70} 98}
71 99
72void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
73{ 101{
74 int cpu = smp_processor_id(); 102 struct callback_register callback = {
75 extern void xen_sysenter_target(void); 103 .type = type,
76 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
81 }; 106 };
82 107
83 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
87 } 150 }
151#endif /* CONFIG_X86_64 */
88} 152}
89 153
90void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -98,10 +162,12 @@ void __init xen_arch_setup(void)
98 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
99 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
100 164
101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
102 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
103 168
104 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
105 171
106 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -121,11 +187,6 @@ void __init xen_arch_setup(void)
121 187
122 pm_idle = xen_idle; 188 pm_idle = xen_idle;
123 189
124#ifdef CONFIG_SMP
125 /* fill cpus_possible with all available cpus */
126 xen_fill_possible_map();
127#endif
128
129 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
130 191
131 fiddle_vdso(); 192 fiddle_vdso();
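
The new xen_memory_setup() builds the pseudo-physical e820 directly: one RAM region covering max_pfn pages, the ISA hole marked reserved even though it is ordinary memory under Xen, and the start-of-day mfn_list/start_info area reserved as well. The toy below reproduces the first two steps with a small region table; the page size and ISA addresses are the usual x86 values, and the mfn_list reservation is omitted because it depends on start_info fields not modelled here.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define PFN_PHYS(pfn)     ((uint64_t)(pfn) << PAGE_SHIFT)
#define ISA_START_ADDRESS 0xa0000ULL
#define ISA_END_ADDRESS   0x100000ULL

enum { E820_RAM = 1, E820_RESERVED = 2 };

struct region { uint64_t start, size; int type; };

static struct region map[8];
static int nr_map;

static void add_region(uint64_t start, uint64_t size, int type)
{
	map[nr_map].start = start;
	map[nr_map].size  = size;
	map[nr_map].type  = type;
	nr_map++;
}

int main(void)
{
	/* pretend the domain was given 128 MB = 32768 pages */
	uint64_t max_pfn = 32768;

	/* one big RAM region, as in xen_memory_setup() */
	add_region(0, PFN_PHYS(max_pfn), E820_RAM);

	/* reserve the ISA hole even though it is ordinary RAM under Xen */
	add_region(ISA_START_ADDRESS,
		   ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED);

	for (int i = 0; i < nr_map; i++)
		printf("%#12llx - %#12llx  %s\n",
		       (unsigned long long)map[i].start,
		       (unsigned long long)(map[i].start + map[i].size),
		       map[i].type == E820_RAM ? "RAM" : "reserved");
	return 0;
}
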
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,8 +11,6 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/err.h> 16#include <linux/err.h>
@@ -35,28 +33,15 @@
35#include "xen-ops.h" 33#include "xen-ops.h"
36#include "mmu.h" 34#include "mmu.h"
37 35
38static cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42 37
43/* 38static DEFINE_PER_CPU(int, resched_irq);
44 * Structure and data for smp_call_function(). This is designed to minimise 39static DEFINE_PER_CPU(int, callfunc_irq);
45 * static memory requirements. It also looks cleaner. 40static DEFINE_PER_CPU(int, callfuncsingle_irq);
46 */ 41static DEFINE_PER_CPU(int, debug_irq) = -1;
47static DEFINE_SPINLOCK(call_lock);
48
49struct call_data_struct {
50 void (*func) (void *info);
51 void *info;
52 atomic_t started;
53 atomic_t finished;
54 int wait;
55};
56 42
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
58 44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
59static struct call_data_struct *call_data;
60 45
61/* 46/*
62 * Reschedule call back. Nothing to do, 47 * Reschedule call back. Nothing to do,
@@ -65,25 +50,46 @@ static struct call_data_struct *call_data;
65 */ 50 */
66static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
67{ 52{
53#ifdef CONFIG_X86_32
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58
68 return IRQ_HANDLED; 59 return IRQ_HANDLED;
69} 60}
70 61
71static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
72{ 63{
73 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
74 65
75 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
68 preempt_disable();
69
76 xen_enable_sysenter(); 70 xen_enable_sysenter();
71 xen_enable_syscall();
77 72
78 preempt_disable(); 73 cpu = smp_processor_id();
79 per_cpu(cpu_state, cpu) = CPU_ONLINE; 74 smp_store_cpu_info(cpu);
75 cpu_data(cpu).x86_max_cores = 1;
76 set_cpu_sibling_map(cpu);
80 77
81 xen_setup_cpu_clockevents(); 78 xen_setup_cpu_clockevents();
82 79
80 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE);
82 wmb();
83
83 /* We can take interrupts now: we're officially "up". */ 84 /* We can take interrupts now: we're officially "up". */
84 local_irq_enable(); 85 local_irq_enable();
85 86
86 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
87 cpu_idle(); 93 cpu_idle();
88} 94}
89 95
@@ -122,6 +128,17 @@ static int xen_smp_intr_init(unsigned int cpu)
122 goto fail; 128 goto fail;
123 per_cpu(debug_irq, cpu) = rc; 129 per_cpu(debug_irq, cpu) = rc;
124 130
131 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
132 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
133 cpu,
134 xen_call_function_single_interrupt,
135 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
136 callfunc_name,
137 NULL);
138 if (rc < 0)
139 goto fail;
140 per_cpu(callfuncsingle_irq, cpu) = rc;
141
125 return 0; 142 return 0;
126 143
127 fail: 144 fail:
@@ -131,59 +148,45 @@ static int xen_smp_intr_init(unsigned int cpu)
131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 148 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0) 149 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 150 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
151 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
152 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
153
134 return rc; 154 return rc;
135} 155}
136 156
137void __init xen_fill_possible_map(void) 157static void __init xen_fill_possible_map(void)
138{ 158{
139 int i, rc; 159 int i, rc;
140 160
141 for (i = 0; i < NR_CPUS; i++) { 161 for (i = 0; i < NR_CPUS; i++) {
142 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
143 if (rc >= 0) 163 if (rc >= 0) {
164 num_processors++;
144 cpu_set(i, cpu_possible_map); 165 cpu_set(i, cpu_possible_map);
166 }
145 } 167 }
146} 168}
147 169
148void __init xen_smp_prepare_boot_cpu(void) 170static void __init xen_smp_prepare_boot_cpu(void)
149{ 171{
150 int cpu;
151
152 BUG_ON(smp_processor_id() != 0); 172 BUG_ON(smp_processor_id() != 0);
153 native_smp_prepare_boot_cpu(); 173 native_smp_prepare_boot_cpu();
154 174
155 /* We've switched to the "real" per-cpu gdt, so make sure the 175 /* We've switched to the "real" per-cpu gdt, so make sure the
156 old memory can be recycled */ 176 old memory can be recycled */
157 make_lowmem_page_readwrite(&per_cpu__gdt_page); 177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
158
159 for_each_possible_cpu(cpu) {
160 cpus_clear(per_cpu(cpu_sibling_map, cpu));
161 /*
162 * cpu_core_map lives in a per cpu area that is cleared
163 * when the per cpu array is allocated.
164 *
165 * cpus_clear(per_cpu(cpu_core_map, cpu));
166 */
167 }
168 178
169 xen_setup_vcpu_info_placement(); 179 xen_setup_vcpu_info_placement();
170} 180}
171 181
172void __init xen_smp_prepare_cpus(unsigned int max_cpus) 182static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
173{ 183{
174 unsigned cpu; 184 unsigned cpu;
175 185
176 for_each_possible_cpu(cpu) { 186 xen_init_lock_cpu(0);
177 cpus_clear(per_cpu(cpu_sibling_map, cpu));
178 /*
179 * cpu_core_ map will be zeroed when the per
180 * cpu area is allocated.
181 *
182 * cpus_clear(per_cpu(cpu_core_map, cpu));
183 */
184 }
185 187
186 smp_store_cpu_info(0); 188 smp_store_cpu_info(0);
189 cpu_data(0).x86_max_cores = 1;
187 set_cpu_sibling_map(0); 190 set_cpu_sibling_map(0);
188 191
189 if (xen_smp_intr_init(0)) 192 if (xen_smp_intr_init(0))
@@ -210,15 +213,13 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
210 213
211 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
212 } 215 }
213
214 //init_xenbus_allowed_cpumask();
215} 216}
216 217
217static __cpuinit int 218static __cpuinit int
218cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 219cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 220{
220 struct vcpu_guest_context *ctxt; 221 struct vcpu_guest_context *ctxt;
221 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 222 struct desc_struct *gdt;
222 223
223 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 224 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
224 return 0; 225 return 0;
@@ -227,12 +228,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
227 if (ctxt == NULL) 228 if (ctxt == NULL)
228 return -ENOMEM; 229 return -ENOMEM;
229 230
231 gdt = get_cpu_gdt_table(cpu);
232
230 ctxt->flags = VGCF_IN_KERNEL; 233 ctxt->flags = VGCF_IN_KERNEL;
231 ctxt->user_regs.ds = __USER_DS; 234 ctxt->user_regs.ds = __USER_DS;
232 ctxt->user_regs.es = __USER_DS; 235 ctxt->user_regs.es = __USER_DS;
233 ctxt->user_regs.fs = __KERNEL_PERCPU;
234 ctxt->user_regs.gs = 0;
235 ctxt->user_regs.ss = __KERNEL_DS; 236 ctxt->user_regs.ss = __KERNEL_DS;
237#ifdef CONFIG_X86_32
238 ctxt->user_regs.fs = __KERNEL_PERCPU;
239#endif
236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 240 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 241 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
238 242
@@ -242,11 +246,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
242 246
243 ctxt->ldt_ents = 0; 247 ctxt->ldt_ents = 0;
244 248
245 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 249 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
246 make_lowmem_page_readonly(gdt->gdt); 250 make_lowmem_page_readonly(gdt);
247 251
248 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 252 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
249 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 253 ctxt->gdt_ents = GDT_ENTRIES;
250 254
251 ctxt->user_regs.cs = __KERNEL_CS; 255 ctxt->user_regs.cs = __KERNEL_CS;
252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 256 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -254,9 +258,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
254 ctxt->kernel_ss = __KERNEL_DS; 258 ctxt->kernel_ss = __KERNEL_DS;
255 ctxt->kernel_sp = idle->thread.sp0; 259 ctxt->kernel_sp = idle->thread.sp0;
256 260
261#ifdef CONFIG_X86_32
257 ctxt->event_callback_cs = __KERNEL_CS; 262 ctxt->event_callback_cs = __KERNEL_CS;
258 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
259 ctxt->failsafe_callback_cs = __KERNEL_CS; 263 ctxt->failsafe_callback_cs = __KERNEL_CS;
264#endif
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
260 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 266 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
261 267
262 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 268 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -269,21 +275,33 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
269 return 0; 275 return 0;
270} 276}
271 277
272int __cpuinit xen_cpu_up(unsigned int cpu) 278static int __cpuinit xen_cpu_up(unsigned int cpu)
273{ 279{
274 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
275 int rc; 281 int rc;
276 282
277#if 0 283#ifdef CONFIG_X86_64
278 rc = cpu_up_check(cpu); 284 /* Allocate node local memory for AP pdas */
279 if (rc) 285 WARN_ON(cpu == 0);
280 return rc; 286 if (cpu > 0) {
287 rc = get_local_pda(cpu);
288 if (rc)
289 return rc;
290 }
281#endif 291#endif
282 292
293#ifdef CONFIG_X86_32
283 init_gdt(cpu); 294 init_gdt(cpu);
284 per_cpu(current_task, cpu) = idle; 295 per_cpu(current_task, cpu) = idle;
285 irq_ctx_init(cpu); 296 irq_ctx_init(cpu);
297#else
298 cpu_pda(cpu)->pcurrent = idle;
299 clear_tsk_thread_flag(idle, TIF_FORK);
300#endif
286 xen_setup_timer(cpu); 301 xen_setup_timer(cpu);
302 xen_init_lock_cpu(cpu);
303
304 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
287 305
288 /* make sure interrupts start blocked */ 306 /* make sure interrupts start blocked */
289 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 307 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -299,23 +317,75 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
299 if (rc) 317 if (rc)
300 return rc; 318 return rc;
301 319
302 smp_store_cpu_info(cpu);
303 set_cpu_sibling_map(cpu);
304 /* This must be done before setting cpu_online_map */
305 wmb();
306
307 cpu_set(cpu, cpu_online_map);
308
309 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 320 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
310 BUG_ON(rc); 321 BUG_ON(rc);
311 322
323 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
324 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
325 barrier();
326 }
327
328 return 0;
329}
330
331static void xen_smp_cpus_done(unsigned int max_cpus)
332{
333}
334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
312 return 0; 345 return 0;
313} 346}
314 347
315void xen_smp_cpus_done(unsigned int max_cpus) 348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
316{ 374{
375 return -ENOSYS;
317} 376}
318 377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
319static void stop_self(void *v) 389static void stop_self(void *v)
320{ 390{
321 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -328,104 +398,94 @@ static void stop_self(void *v)
328 BUG(); 398 BUG();
329} 399}
330 400
331void xen_smp_send_stop(void) 401static void xen_smp_send_stop(void)
332{ 402{
333 smp_call_function(stop_self, NULL, 0, 0); 403 smp_call_function(stop_self, NULL, 0);
334} 404}
335 405
336void xen_smp_send_reschedule(int cpu) 406static void xen_smp_send_reschedule(int cpu)
337{ 407{
338 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 408 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
339} 409}
340 410
341
342static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 411static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
343{ 412{
344 unsigned cpu; 413 unsigned cpu;
345 414
346 cpus_and(mask, mask, cpu_online_map); 415 cpus_and(mask, mask, cpu_online_map);
347 416
348 for_each_cpu_mask(cpu, mask) 417 for_each_cpu_mask_nr(cpu, mask)
349 xen_send_IPI_one(cpu, vector); 418 xen_send_IPI_one(cpu, vector);
350} 419}
351 420
421static void xen_smp_send_call_function_ipi(cpumask_t mask)
422{
423 int cpu;
424
425 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
426
427 /* Make sure other vcpus get a chance to run if they need to. */
428 for_each_cpu_mask_nr(cpu, mask) {
429 if (xen_vcpu_stolen(cpu)) {
430 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
431 break;
432 }
433 }
434}
435
436static void xen_smp_send_call_function_single_ipi(int cpu)
437{
438 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
439}
440
352static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 441static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
353{ 442{
354 void (*func) (void *info) = call_data->func;
355 void *info = call_data->info;
356 int wait = call_data->wait;
357
358 /*
359 * Notify initiating CPU that I've grabbed the data and am
360 * about to execute the function
361 */
362 mb();
363 atomic_inc(&call_data->started);
364 /*
365 * At this point the info structure may be out of scope unless wait==1
366 */
367 irq_enter(); 443 irq_enter();
368 (*func)(info); 444 generic_smp_call_function_interrupt();
445#ifdef CONFIG_X86_32
369 __get_cpu_var(irq_stat).irq_call_count++; 446 __get_cpu_var(irq_stat).irq_call_count++;
447#else
448 add_pda(irq_call_count, 1);
449#endif
370 irq_exit(); 450 irq_exit();
371 451
372 if (wait) {
373 mb(); /* commit everything before setting finished */
374 atomic_inc(&call_data->finished);
375 }
376
377 return IRQ_HANDLED; 452 return IRQ_HANDLED;
378} 453}
379 454
380int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 455static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
381 void *info, int wait)
382{ 456{
383 struct call_data_struct data; 457 irq_enter();
384 int cpus, cpu; 458 generic_smp_call_function_single_interrupt();
385 bool yield; 459#ifdef CONFIG_X86_32
386 460 __get_cpu_var(irq_stat).irq_call_count++;
387 /* Holding any lock stops cpus from going down. */ 461#else
388 spin_lock(&call_lock); 462 add_pda(irq_call_count, 1);
389 463#endif
390 cpu_clear(smp_processor_id(), mask); 464 irq_exit();
391
392 cpus = cpus_weight(mask);
393 if (!cpus) {
394 spin_unlock(&call_lock);
395 return 0;
396 }
397
398 /* Can deadlock when called with interrupts disabled */
399 WARN_ON(irqs_disabled());
400
401 data.func = func;
402 data.info = info;
403 atomic_set(&data.started, 0);
404 data.wait = wait;
405 if (wait)
406 atomic_set(&data.finished, 0);
407
408 call_data = &data;
409 mb(); /* write everything before IPI */
410 465
411 /* Send a message to other CPUs and wait for them to respond */ 466 return IRQ_HANDLED;
412 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 467}
413 468
414 /* Make sure other vcpus get a chance to run if they need to. */ 469static const struct smp_ops xen_smp_ops __initdata = {
415 yield = false; 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
416 for_each_cpu_mask(cpu, mask) 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
417 if (xen_vcpu_stolen(cpu)) 472 .smp_cpus_done = xen_smp_cpus_done,
418 yield = true;
419 473
420 if (yield) 474 .cpu_up = xen_cpu_up,
421 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
422 478
423 /* Wait for response */ 479 .smp_send_stop = xen_smp_send_stop,
424 while (atomic_read(&data.started) != cpus || 480 .smp_send_reschedule = xen_smp_send_reschedule,
425 (wait && atomic_read(&data.finished) != cpus))
426 cpu_relax();
427 481
428 spin_unlock(&call_lock); 482 .send_call_func_ipi = xen_smp_send_call_function_ipi,
483 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
484};
429 485
430 return 0; 486void __init xen_smp_init(void)
487{
488 smp_ops = xen_smp_ops;
489 xen_fill_possible_map();
490 xen_init_spinlocks();
431} 491}
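
The smp.c rewrite drops the private call_data machinery in favour of the generic smp_call_function code and installs everything through a struct smp_ops table in xen_smp_init(). One Xen-specific detail survives in xen_smp_send_call_function_ipi(): after sending the IPIs it yields once if any target vcpu currently has its time stolen, so the cross call is not left waiting on a descheduled vcpu. A minimal model of that behaviour follows; vcpu_stolen[], send_ipi() and yield_to_hypervisor() are stand-ins for the hypercall-based primitives.

#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

/* stand-ins for the hypervisor interfaces used by the real code */
static bool vcpu_stolen[NCPUS] = { false, true, false, false };

static void send_ipi(int cpu)		{ printf("IPI -> cpu%d\n", cpu); }
static void yield_to_hypervisor(void)	{ printf("yield\n"); }

/* mirrors xen_smp_send_call_function_ipi(): notify every cpu in the
   mask, then yield once if any target vcpu is currently descheduled */
static void send_call_function_ipi(unsigned mask)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		if (mask & (1u << cpu))
			send_ipi(cpu);

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if ((mask & (1u << cpu)) && vcpu_stolen[cpu]) {
			yield_to_hypervisor();
			break;
		}
	}
}

int main(void)
{
	send_call_function_ipi(0x6);	/* cpus 1 and 2 */
	return 0;
}
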
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..dd71e3a021cd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
215 /* check again to make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_this_cpu.irqs[irq]++;
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357 printk("cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
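
The new spinlock.c implements the lock word as a single byte plus a count of slow-path spinners: the fast path is an atomic xchg of 1 into the byte, and unlock clears the byte and kicks a blocked waiter only if the spinners count is non-zero. The sketch below models just that fast path and the unlock check in plain C, using the GCC __sync builtins as a stand-in for the inline asm; the slow path (event-channel poll, nesting, statistics) is left out.

#include <stdio.h>

/* toy mirror of struct xen_spinlock */
struct byte_lock {
	unsigned char lock;		/* 0 -> free, 1 -> locked */
	unsigned short spinners;	/* waiters in the slow path */
};

/* fast-path trylock: the kernel uses an inline xchgb; the GCC
   __sync builtin is an equivalent atomic exchange for this sketch */
static int trylock(struct byte_lock *l)
{
	unsigned char old = __sync_lock_test_and_set(&l->lock, 1);
	return old == 0;
}

static void unlock(struct byte_lock *l)
{
	__sync_synchronize();	/* writes in the critical section stay inside */
	l->lock = 0;
	if (l->spinners)	/* kernel would kick a blocked waiter here */
		printf("kick a waiter\n");
}

int main(void)
{
	struct byte_lock l = { 0, 0 };

	printf("first trylock:  %d\n", trylock(&l));	/* 1: acquired */
	printf("second trylock: %d\n", trylock(&l));	/* 0: already held */
	unlock(&l);
	printf("third trylock:  %d\n", trylock(&l));	/* 1: acquired again */
	return 0;
}
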
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..2a234db5949b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,48 @@
1#include <linux/types.h>
2
3#include <xen/interface/xen.h>
4#include <xen/grant_table.h>
5#include <xen/events.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h>
9
10#include "xen-ops.h"
11#include "mmu.h"
12
13void xen_pre_suspend(void)
14{
15 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
16 xen_start_info->console.domU.mfn =
17 mfn_to_pfn(xen_start_info->console.domU.mfn);
18
19 BUG_ON(!irqs_disabled());
20
21 HYPERVISOR_shared_info = &xen_dummy_shared_info;
22 if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
23 __pte_ma(0), 0))
24 BUG();
25}
26
27void xen_post_suspend(int suspend_cancelled)
28{
29 xen_setup_shared_info();
30
31 if (suspend_cancelled) {
32 xen_start_info->store_mfn =
33 pfn_to_mfn(xen_start_info->store_mfn);
34 xen_start_info->console.domU.mfn =
35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else {
37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map;
39#endif
40 xen_vcpu_restore();
41 }
42
43}
44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 41e217503c96..004ba86326ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -197,8 +195,8 @@ unsigned long long xen_sched_clock(void)
197} 195}
198 196
199 197
200/* Get the CPU speed from Xen */ 198/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 199unsigned long xen_tsc_khz(void)
202{ 200{
203 u64 xen_khz = 1000000ULL << 32; 201 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 202 const struct pvclock_vcpu_time_info *info =
@@ -213,7 +211,7 @@ unsigned long xen_cpu_khz(void)
213 return xen_khz; 211 return xen_khz;
214} 212}
215 213
216static cycle_t xen_clocksource_read(void) 214cycle_t xen_clocksource_read(void)
217{ 215{
218 struct pvclock_vcpu_time_info *src; 216 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 217 cycle_t ret;
@@ -452,6 +450,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 450 setup_runstate_info(cpu);
453} 451}
454 452
453void xen_teardown_timer(int cpu)
454{
455 struct clock_event_device *evt;
456 BUG_ON(cpu == 0);
457 evt = &per_cpu(xen_clock_events, cpu);
458 unbind_from_irqhandler(evt->irq, NULL);
459}
460
455void xen_setup_cpu_clockevents(void) 461void xen_setup_cpu_clockevents(void)
456{ 462{
457 BUG_ON(preemptible()); 463 BUG_ON(preemptible());
@@ -459,6 +465,19 @@ void xen_setup_cpu_clockevents(void)
459 clockevents_register_device(&__get_cpu_var(xen_clock_events)); 465 clockevents_register_device(&__get_cpu_var(xen_clock_events));
460} 466}
461 467
468void xen_timer_resume(void)
469{
470 int cpu;
471
472 if (xen_clockevent != &xen_vcpuop_clockevent)
473 return;
474
475 for_each_online_cpu(cpu) {
476 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
477 BUG();
478 }
479}
480
462__init void xen_time_init(void) 481__init void xen_time_init(void)
463{ 482{
464 int cpu = smp_processor_id(); 483 int cpu = smp_processor_id();
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..05794c566e87
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,285 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and tests the pending
41 event status with one and operation. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
99 In principle the caller should be passing us a value return
100 from xen_save_fl_direct, but for robustness sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151
152ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx
154 mov 8+8(%rsp),%r11
155 ret $16
156
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/*
159 Xen64 iret frame:
160
161 ss
162 rsp
163 rflags
164 cs
165 rip <-- standard iret frame
166
167 flags
168
169 rcx }
170 r11 }<-- pushed by hypercall page
171rsp -> rax }
172 */
173ENTRY(xen_iret)
174 pushq $0
1751: jmp hypercall_iret
176ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1)

/*
	sysexit is not used for 64-bit processes, so it's only ever used
	to return to 32-bit compat userspace.
 */
ENTRY(xen_sysexit)
	pushq $__USER32_DS
	pushq %rcx
	pushq $X86_EFLAGS_IF
	pushq $__USER32_CS
	pushq %rdx

	pushq $0
1:	jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)

ENTRY(xen_sysret64)
	/* We're already on the usermode stack at this point, but still
	   with the kernel gs, so we can easily switch back */
	movq %rsp, %gs:pda_oldrsp
	movq %gs:pda_kernelstack,%rsp

	pushq $__USER_DS
	pushq %gs:pda_oldrsp
	pushq %r11
	pushq $__USER_CS
	pushq %rcx

	pushq $VGCF_in_syscall
1:	jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)

ENTRY(xen_sysret32)
	/* We're already on the usermode stack at this point, but still
	   with the kernel gs, so we can easily switch back */
	movq %rsp, %gs:pda_oldrsp
	movq %gs:pda_kernelstack, %rsp

	pushq $__USER32_DS
	pushq %gs:pda_oldrsp
	pushq %r11
	pushq $__USER32_CS
	pushq %rcx

	pushq $VGCF_in_syscall
1:	jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)

/*
	Xen handles syscall callbacks much like ordinary exceptions,
	which means we have:
	 - kernel gs
	 - kernel rsp
	 - an iret-like stack frame on the stack (including rcx and r11):
		ss
		rsp
		rflags
		cs
		rip
		r11
	rsp ->	rcx

	In all the entrypoints, we undo all that to make it look like
	a CPU-generated syscall/sysenter and jump to the normal
	entrypoint.
 */

.macro undo_xen_syscall
	mov 0*8(%rsp),%rcx
	mov 1*8(%rsp),%r11
	mov 5*8(%rsp),%rsp
.endm
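
The 0*8/1*8/5*8 offsets in undo_xen_syscall fall straight out of the frame sketched in the comment. As an illustrative C view (names hypothetical), from %rsp upward:

#include <linux/types.h>

struct xen_syscall_frame_sketch {
	u64 rcx;	/* 0*8: reloaded into %rcx; the native entry path
			   expects the user return %rip here, just as a
			   hardware syscall would have left it */
	u64 r11;	/* 1*8: reloaded into %r11 (user %rflags) */
	u64 rip;	/* 2*8 */
	u64 cs;		/* 3*8 */
	u64 rflags;	/* 4*8 */
	u64 rsp;	/* 5*8: user stack pointer, loaded back into %rsp */
	u64 ss;		/* 6*8 */
};

Loading those three slots and jumping to the normal entry point is enough to make the Xen callback indistinguishable from a CPU-generated syscall as far as the rest of the kernel is concerned.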

/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
	undo_xen_syscall
	jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)

#ifdef CONFIG_IA32_EMULATION

/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
	undo_xen_syscall
	jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)

/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
	undo_xen_syscall
	jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)

#else /* !CONFIG_IA32_EMULATION */

ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
	lea 16(%rsp), %rsp	/* strip %rcx,%r11 */
	mov $-ENOSYS, %rax
	pushq $VGCF_in_syscall
	jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)

#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,14 +5,24 @@
 
 #include <linux/elfnote.h>
 #include <linux/init.h>
+
 #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
+#include <asm/xen/interface.h>
 
 	__INIT
 ENTRY(startup_xen)
-	movl %esi,xen_start_info
 	cld
-	movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+	mov %esi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%esp
+#else
+	mov %rsi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
 	jmp xen_start_kernel
 
 	__FINIT
@@ -20,17 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
-	.skip 0x1000
+	.skip PAGE_SIZE_asm
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+#ifdef CONFIG_X86_32
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
+		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
 
 #endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
 #define XEN_OPS_H
 
 #include <linux/init.h>
+#include <linux/clocksource.h>
 #include <linux/irqreturn.h>
 #include <xen/xen-ops.h>
 
@@ -9,22 +10,34 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
+struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
 
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
 extern struct start_info *xen_start_info;
+extern struct shared_info xen_dummy_shared_info;
 extern struct shared_info *HYPERVISOR_shared_info;
 
+void xen_setup_mfn_list_list(void);
+void xen_setup_shared_info(void);
+
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
+void xen_vcpu_restore(void);
+
+void __init xen_build_dynamic_phys_to_machine(void);
 
+void xen_init_irq_ops(void);
 void xen_setup_timer(int cpu);
+void xen_teardown_timer(int cpu);
+cycle_t xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
-unsigned long xen_cpu_khz(void);
+unsigned long xen_tsc_khz(void);
 void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
@@ -36,23 +49,19 @@ bool xen_vcpu_stolen(int vcpu);
 
 void xen_mark_init_mm_pinned(void);
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			   int wait);
-int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-				 int nonatomic, int wait);
-
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-			       void *info, int wait);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
+
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+
+extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 /* Declare an asm function, along with symbols needed to make it
@@ -67,7 +76,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 #endif /* XEN_OPS_H */
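
For context on the ENDPATCH()/RELOC() markers used throughout xen-asm_64.S above: the DECL_ASM() macro whose comment is truncated by this hunk declares, for each patchable asm stub, the companion symbols that the pvops patching machinery relies on. The exact macro body lies outside this hunk; a hedged sketch of its shape, under that assumption:

/* Sketch only: for a stub "name", declare the function plus the name_end and
   name_reloc symbols that ENDPATCH()/RELOC() define in the .S file, so the
   patcher knows how many bytes to copy and which byte may need fixing up. */
#define DECL_ASM_SKETCH(ret, name, ...)		\
	ret name(__VA_ARGS__);			\
	extern char name##_end[];		\
	extern char name##_reloc[]

DECL_ASM_SKETCH(void, xen_irq_enable_direct, void);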