aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig296
-rw-r--r--arch/x86/Kconfig.cpu12
-rw-r--r--arch/x86/Kconfig.debug57
-rw-r--r--arch/x86/Makefile27
-rw-r--r--arch/x86/boot/a20.c5
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c98
-rw-r--r--arch/x86/boot/compressed/relocs.c198
-rw-r--r--arch/x86/boot/cpu.c2
-rw-r--r--arch/x86/boot/edd.c5
-rw-r--r--arch/x86/boot/main.c4
-rw-r--r--arch/x86/boot/memory.c3
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/boot/pmjump.S4
-rw-r--r--arch/x86/boot/video-vga.c3
-rw-r--r--arch/x86/configs/i386_defconfig1711
-rw-r--r--arch/x86/configs/x86_64_defconfig1735
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/ia32/ia32_signal.c11
-rw-r--r--arch/x86/ia32/ia32entry.S171
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/kernel/Makefile30
-rw-r--r--arch/x86/kernel/acpi/boot.c442
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/acpi/processor.c6
-rw-r--r--arch/x86/kernel/acpi/sleep.c32
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/amd_iommu.c1167
-rw-r--r--arch/x86/kernel/amd_iommu_init.c1060
-rw-r--r--arch/x86/kernel/aperture_64.c314
-rw-r--r--arch/x86/kernel/apic_32.c298
-rw-r--r--arch/x86/kernel/apic_64.c94
-rw-r--r--arch/x86/kernel/apm_32.c38
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c18
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c17
-rw-r--r--arch/x86/kernel/cpu/amd.c44
-rw-r--r--arch/x86/kernel/cpu/amd_64.c224
-rw-r--r--arch/x86/kernel/cpu/bugs.c50
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c (renamed from arch/x86/kernel/bugs_64.c)0
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c35
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/common_64.c670
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c16
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c23
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c157
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c14
-rw-r--r--arch/x86/kernel/cpu/intel_64.c95
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c13
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c54
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c92
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c38
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c905
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h3
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c214
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpuid.c31
-rw-r--r--arch/x86/kernel/e820.c1365
-rw-r--r--arch/x86/kernel/e820_32.c775
-rw-r--r--arch/x86/kernel/e820_64.c952
-rw-r--r--arch/x86/kernel/early-quirks.c46
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/efi.c67
-rw-r--r--arch/x86/kernel/efi_64.c8
-rw-r--r--arch/x86/kernel/entry_32.S162
-rw-r--r--arch/x86/kernel/entry_64.S339
-rw-r--r--arch/x86/kernel/ftrace.c141
-rw-r--r--arch/x86/kernel/genapic_64.c2
-rw-r--r--arch/x86/kernel/genapic_flat_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c241
-rw-r--r--arch/x86/kernel/head.c55
-rw-r--r--arch/x86/kernel/head32.c27
-rw-r--r--arch/x86/kernel/head64.c101
-rw-r--r--arch/x86/kernel/head_32.S13
-rw-r--r--arch/x86/kernel/head_64.S100
-rw-r--r--arch/x86/kernel/hpet.c73
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c9
-rw-r--r--arch/x86/kernel/i387.c4
-rw-r--r--arch/x86/kernel/i8259.c (renamed from arch/x86/kernel/i8259_32.c)136
-rw-r--r--arch/x86/kernel/i8259_64.c512
-rw-r--r--arch/x86/kernel/io_apic_32.c723
-rw-r--r--arch/x86/kernel/io_apic_64.c327
-rw-r--r--arch/x86/kernel/io_delay.c3
-rw-r--r--arch/x86/kernel/ipi.c7
-rw-r--r--arch/x86/kernel/irq_32.c253
-rw-r--r--arch/x86/kernel/irq_64.c28
-rw-r--r--arch/x86/kernel/irqinit_32.c114
-rw-r--r--arch/x86/kernel/irqinit_64.c222
-rw-r--r--arch/x86/kernel/kdebugfs.c8
-rw-r--r--arch/x86/kernel/kprobes.c7
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c12
-rw-r--r--arch/x86/kernel/machine_kexec_32.c47
-rw-r--r--arch/x86/kernel/machine_kexec_64.c8
-rw-r--r--arch/x86/kernel/microcode.c58
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c1
-rw-r--r--arch/x86/kernel/module_64.c11
-rw-r--r--arch/x86/kernel/mpparse.c883
-rw-r--r--arch/x86/kernel/msr.c20
-rw-r--r--arch/x86/kernel/nmi.c (renamed from arch/x86/kernel/nmi_64.c)225
-rw-r--r--arch/x86/kernel/nmi_32.c467
-rw-r--r--arch/x86/kernel/numaq_32.c215
-rw-r--r--arch/x86/kernel/paravirt.c65
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c9
-rw-r--r--arch/x86/kernel/pci-calgary_64.c160
-rw-r--r--arch/x86/kernel/pci-dma.c72
-rw-r--r--arch/x86/kernel/pci-gart_64.c103
-rw-r--r--arch/x86/kernel/pci-nommu.c16
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c6
-rw-r--r--arch/x86/kernel/probe_roms_32.c166
-rw-r--r--arch/x86/kernel/process.c225
-rw-r--r--arch/x86/kernel/process_32.c70
-rw-r--r--arch/x86/kernel/process_64.c142
-rw-r--r--arch/x86/kernel/ptrace.c155
-rw-r--r--arch/x86/kernel/quirks.c60
-rw-r--r--arch/x86/kernel/reboot.c40
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c4
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S174
-rw-r--r--arch/x86/kernel/setup.c924
-rw-r--r--arch/x86/kernel/setup64.c287
-rw-r--r--arch/x86/kernel/setup_32.c964
-rw-r--r--arch/x86/kernel/setup_64.c1194
-rw-r--r--arch/x86/kernel/setup_percpu.c399
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c62
-rw-r--r--arch/x86/kernel/smp.c158
-rw-r--r--arch/x86/kernel/smpboot.c330
-rw-r--r--arch/x86/kernel/smpcommon.c56
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/summit_32.c2
-rw-r--r--arch/x86/kernel/sys_i386_32.c64
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/time_32.c7
-rw-r--r--arch/x86/kernel/time_64.c16
-rw-r--r--arch/x86/kernel/tlb_32.c2
-rw-r--r--arch/x86/kernel/tlb_64.c7
-rw-r--r--arch/x86/kernel/tlb_uv.c792
-rw-r--r--arch/x86/kernel/trampoline.c2
-rw-r--r--arch/x86/kernel/traps_32.c306
-rw-r--r--arch/x86/kernel/traps_64.c571
-rw-r--r--arch/x86/kernel/tsc.c535
-rw-r--r--arch/x86/kernel/tsc_32.c451
-rw-r--r--arch/x86/kernel/tsc_64.c357
-rw-r--r--arch/x86/kernel/visws_quirks.c707
-rw-r--r--arch/x86/kernel/vmi_32.c7
-rw-r--r--arch/x86/kernel/vmiclock_32.c7
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S15
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S18
-rw-r--r--arch/x86/kernel/vsmp_64.c3
-rw-r--r--arch/x86/kernel/vsyscall_64.c19
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c16
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/i8254.c24
-rw-r--r--arch/x86/kvm/i8259.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c14
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c62
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h28
-rw-r--r--arch/x86/kvm/svm.c131
-rw-r--r--arch/x86/kvm/vmx.c232
-rw-r--r--arch/x86/kvm/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c301
-rw-r--r--arch/x86/kvm/x86_emulate.c257
-rw-r--r--arch/x86/lguest/Kconfig2
-rw-r--r--arch/x86/lguest/boot.c12
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/copy_user_64.S429
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S283
-rw-r--r--arch/x86/lib/delay.c (renamed from arch/x86/lib/delay_32.c)38
-rw-r--r--arch/x86/lib/delay_64.c85
-rw-r--r--arch/x86/lib/getuser.S (renamed from arch/x86/lib/getuser_64.S)87
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/msr-on-cpu.c8
-rw-r--r--arch/x86/lib/putuser.S (renamed from arch/x86/lib/putuser_32.S)73
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/thunk_32.S47
-rw-r--r--arch/x86/lib/thunk_64.S19
-rw-r--r--arch/x86/lib/usercopy_64.c23
-rw-r--r--arch/x86/mach-default/setup.c76
-rw-r--r--arch/x86/mach-es7000/Makefile1
-rw-r--r--arch/x86/mach-es7000/es7000plat.c57
-rw-r--r--arch/x86/mach-generic/Makefile10
-rw-r--r--arch/x86/mach-generic/bigsmp.c4
-rw-r--r--arch/x86/mach-generic/numaq.c41
-rw-r--r--arch/x86/mach-generic/probe.c15
-rw-r--r--arch/x86/mach-visws/Makefile8
-rw-r--r--arch/x86/mach-visws/mpparse.c88
-rw-r--r--arch/x86/mach-visws/reboot.c55
-rw-r--r--arch/x86/mach-visws/setup.c183
-rw-r--r--arch/x86/mach-visws/traps.c69
-rw-r--r--arch/x86/mach-visws/visws_apic.c297
-rw-r--r--arch/x86/mach-voyager/setup.c37
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c112
-rw-r--r--arch/x86/math-emu/reg_constant.c8
-rw-r--r--arch/x86/mm/Makefile10
-rw-r--r--arch/x86/mm/discontig_32.c288
-rw-r--r--arch/x86/mm/dump_pagetables.c12
-rw-r--r--arch/x86/mm/fault.c110
-rw-r--r--arch/x86/mm/gup.c295
-rw-r--r--arch/x86/mm/hugetlbpage.c78
-rw-r--r--arch/x86/mm/init_32.c528
-rw-r--r--arch/x86/mm/init_64.c675
-rw-r--r--arch/x86/mm/ioremap.c45
-rw-r--r--arch/x86/mm/k8topology_64.c21
-rw-r--r--arch/x86/mm/kmmio.c510
-rw-r--r--arch/x86/mm/memtest.c123
-rw-r--r--arch/x86/mm/mmio-mod.c515
-rw-r--r--arch/x86/mm/numa_64.c99
-rw-r--r--arch/x86/mm/pageattr-test.c21
-rw-r--r--arch/x86/mm/pageattr.c67
-rw-r--r--arch/x86/mm/pat.c470
-rw-r--r--arch/x86/mm/pf_in.c489
-rw-r--r--arch/x86/mm/pf_in.h39
-rw-r--r--arch/x86/mm/pgtable.c190
-rw-r--r--arch/x86/mm/pgtable_32.c103
-rw-r--r--arch/x86/mm/srat_32.c (renamed from arch/x86/kernel/srat_32.c)223
-rw-r--r--arch/x86/mm/srat_64.c21
-rw-r--r--arch/x86/mm/testmmiotrace.c71
-rw-r--r--arch/x86/oprofile/nmi_int.c49
-rw-r--r--arch/x86/pci/Makefile22
-rw-r--r--arch/x86/pci/Makefile_3224
-rw-r--r--arch/x86/pci/Makefile_6417
-rw-r--r--arch/x86/pci/acpi.c21
-rw-r--r--arch/x86/pci/amd_bus.c (renamed from arch/x86/pci/k8-bus_64.c)108
-rw-r--r--arch/x86/pci/common.c34
-rw-r--r--arch/x86/pci/direct.c25
-rw-r--r--arch/x86/pci/early.c72
-rw-r--r--arch/x86/pci/i386.c9
-rw-r--r--arch/x86/pci/init.c4
-rw-r--r--arch/x86/pci/irq.c386
-rw-r--r--arch/x86/pci/legacy.c19
-rw-r--r--arch/x86/pci/mmconfig-shared.c2
-rw-r--r--arch/x86/pci/mp_bus_to_node.c23
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)35
-rw-r--r--arch/x86/pci/pci.h15
-rw-r--r--arch/x86/pci/visws.c21
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vclock_gettime.c15
-rw-r--r--arch/x86/vdso/vdso32-setup.c30
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c13
-rw-r--r--arch/x86/xen/Kconfig20
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c835
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c572
-rw-r--r--arch/x86/xen/mmu.h39
-rw-r--r--arch/x86/xen/multicalls.c41
-rw-r--r--arch/x86/xen/multicalls.h12
-rw-r--r--arch/x86/xen/setup.c109
-rw-r--r--arch/x86/xen/smp.c419
-rw-r--r--arch/x86/xen/suspend.c48
-rw-r--r--arch/x86/xen/time.c17
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)0
-rw-r--r--arch/x86/xen/xen-asm_64.S271
-rw-r--r--arch/x86/xen/xen-head.S31
-rw-r--r--arch/x86/xen/xen-ops.h35
275 files changed, 25722 insertions, 15595 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bf07b6f50fa1..6bdde845818e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,10 +21,16 @@ config X86
21 select HAVE_UNSTABLE_SCHED_CLOCK 21 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 22 select HAVE_IDE
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_IOREMAP_PROT
25 select HAVE_GET_USER_PAGES_FAST
24 select HAVE_KPROBES 26 select HAVE_KPROBES
27 select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X
25 select HAVE_KRETPROBES 28 select HAVE_KRETPROBES
29 select HAVE_DYNAMIC_FTRACE
30 select HAVE_FTRACE
26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 31 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
27 select HAVE_ARCH_KGDB if !X86_VOYAGER 32 select HAVE_ARCH_KGDB if !X86_VOYAGER
33 select HAVE_EFFICIENT_UNALIGNED_ACCESS
28 34
29config ARCH_DEFCONFIG 35config ARCH_DEFCONFIG
30 string 36 string
@@ -121,7 +127,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
121 def_bool y 127 def_bool y
122 128
123config HAVE_SETUP_PER_CPU_AREA 129config HAVE_SETUP_PER_CPU_AREA
124 def_bool X86_64 || (X86_SMP && !X86_VOYAGER) 130 def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
125 131
126config HAVE_CPUMASK_OF_CPU_MAP 132config HAVE_CPUMASK_OF_CPU_MAP
127 def_bool X86_64_SMP 133 def_bool X86_64_SMP
@@ -168,6 +174,7 @@ config GENERIC_PENDING_IRQ
168config X86_SMP 174config X86_SMP
169 bool 175 bool
170 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) 176 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
177 select USE_GENERIC_SMP_HELPERS
171 default y 178 default y
172 179
173config X86_32_SMP 180config X86_32_SMP
@@ -181,12 +188,12 @@ config X86_64_SMP
181config X86_HT 188config X86_HT
182 bool 189 bool
183 depends on SMP 190 depends on SMP
184 depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64 191 depends on (X86_32 && !X86_VOYAGER) || X86_64
185 default y 192 default y
186 193
187config X86_BIOS_REBOOT 194config X86_BIOS_REBOOT
188 bool 195 bool
189 depends on !X86_VISWS && !X86_VOYAGER 196 depends on !X86_VOYAGER
190 default y 197 default y
191 198
192config X86_TRAMPOLINE 199config X86_TRAMPOLINE
@@ -230,6 +237,26 @@ config SMP
230 237
231 If you don't know what to do here, say N. 238 If you don't know what to do here, say N.
232 239
240config X86_FIND_SMP_CONFIG
241 def_bool y
242 depends on X86_MPPARSE || X86_VOYAGER
243
244if ACPI
245config X86_MPPARSE
246 def_bool y
247 bool "Enable MPS table"
248 depends on X86_LOCAL_APIC
249 help
250 For old smp systems that do not have proper acpi support. Newer systems
251 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
252endif
253
254if !ACPI
255config X86_MPPARSE
256 def_bool y
257 depends on X86_LOCAL_APIC
258endif
259
233choice 260choice
234 prompt "Subarchitecture Type" 261 prompt "Subarchitecture Type"
235 default X86_PC 262 default X86_PC
@@ -251,7 +278,7 @@ config X86_ELAN
251 278
252config X86_VOYAGER 279config X86_VOYAGER
253 bool "Voyager (NCR)" 280 bool "Voyager (NCR)"
254 depends on X86_32 && (SMP || BROKEN) 281 depends on X86_32 && (SMP || BROKEN) && !PCI
255 help 282 help
256 Voyager is an MCA-based 32-way capable SMP architecture proprietary 283 Voyager is an MCA-based 32-way capable SMP architecture proprietary
257 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. 284 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
@@ -261,16 +288,27 @@ config X86_VOYAGER
261 If you do not specifically know you have a Voyager based machine, 288 If you do not specifically know you have a Voyager based machine,
262 say N here, otherwise the kernel you build will not be bootable. 289 say N here, otherwise the kernel you build will not be bootable.
263 290
291config X86_GENERICARCH
292 bool "Generic architecture"
293 depends on X86_32
294 help
295 This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
296 subarchitectures. It is intended for a generic binary kernel.
297 if you select them all, kernel will probe it one by one. and will
298 fallback to default.
299
300if X86_GENERICARCH
301
264config X86_NUMAQ 302config X86_NUMAQ
265 bool "NUMAQ (IBM/Sequent)" 303 bool "NUMAQ (IBM/Sequent)"
266 depends on SMP && X86_32 304 depends on SMP && X86_32 && PCI && X86_MPPARSE
267 select NUMA 305 select NUMA
268 help 306 help
269 This option is used for getting Linux to run on a (IBM/Sequent) NUMA 307 This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
270 multiquad box. This changes the way that processors are bootstrapped, 308 NUMA multiquad box. This changes the way that processors are
271 and uses Clustered Logical APIC addressing mode instead of Flat Logical. 309 bootstrapped, and uses Clustered Logical APIC addressing mode instead
272 You will need a new lynxer.elf file to flash your firmware with - send 310 of Flat Logical. You will need a new lynxer.elf file to flash your
273 email to <Martin.Bligh@us.ibm.com>. 311 firmware with - send email to <Martin.Bligh@us.ibm.com>.
274 312
275config X86_SUMMIT 313config X86_SUMMIT
276 bool "Summit/EXA (IBM x440)" 314 bool "Summit/EXA (IBM x440)"
@@ -279,46 +317,21 @@ config X86_SUMMIT
279 This option is needed for IBM systems that use the Summit/EXA chipset. 317 This option is needed for IBM systems that use the Summit/EXA chipset.
280 In particular, it is needed for the x440. 318 In particular, it is needed for the x440.
281 319
282 If you don't have one of these computers, you should say N here. 320config X86_ES7000
283 If you want to build a NUMA kernel, you must select ACPI. 321 bool "Support for Unisys ES7000 IA32 series"
322 depends on X86_32 && SMP
323 help
324 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
325 supposed to run on an IA32-based Unisys ES7000 system.
284 326
285config X86_BIGSMP 327config X86_BIGSMP
286 bool "Support for other sub-arch SMP systems with more than 8 CPUs" 328 bool "Support for big SMP systems with more than 8 CPUs"
287 depends on X86_32 && SMP 329 depends on X86_32 && SMP
288 help 330 help
289 This option is needed for the systems that have more than 8 CPUs 331 This option is needed for the systems that have more than 8 CPUs
290 and if the system is not of any sub-arch type above. 332 and if the system is not of any sub-arch type above.
291 333
292 If you don't have such a system, you should say N here. 334endif
293
294config X86_VISWS
295 bool "SGI 320/540 (Visual Workstation)"
296 depends on X86_32
297 help
298 The SGI Visual Workstation series is an IA32-based workstation
299 based on SGI systems chips with some legacy PC hardware attached.
300
301 Say Y here to create a kernel to run on the SGI 320 or 540.
302
303 A kernel compiled for the Visual Workstation will not run on PCs
304 and vice versa. See <file:Documentation/sgi-visws.txt> for details.
305
306config X86_GENERICARCH
307 bool "Generic architecture (Summit, bigsmp, ES7000, default)"
308 depends on X86_32
309 help
310 This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
311 It is intended for a generic binary kernel.
312 If you want a NUMA kernel, select ACPI. We need SRAT for NUMA.
313
314config X86_ES7000
315 bool "Support for Unisys ES7000 IA32 series"
316 depends on X86_32 && SMP
317 help
318 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
319 supposed to run on an IA32-based Unisys ES7000 system.
320 Only choose this option if you have such a system, otherwise you
321 should say N here.
322 335
323config X86_RDC321X 336config X86_RDC321X
324 bool "RDC R-321x SoC" 337 bool "RDC R-321x SoC"
@@ -337,7 +350,7 @@ config X86_RDC321X
337config X86_VSMP 350config X86_VSMP
338 bool "Support for ScaleMP vSMP" 351 bool "Support for ScaleMP vSMP"
339 select PARAVIRT 352 select PARAVIRT
340 depends on X86_64 353 depends on X86_64 && PCI
341 help 354 help
342 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is 355 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
343 supposed to run on these EM64T-based machines. Only choose this option 356 supposed to run on these EM64T-based machines. Only choose this option
@@ -345,6 +358,18 @@ config X86_VSMP
345 358
346endchoice 359endchoice
347 360
361config X86_VISWS
362 bool "SGI 320/540 (Visual Workstation)"
363 depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
364 help
365 The SGI Visual Workstation series is an IA32-based workstation
366 based on SGI systems chips with some legacy PC hardware attached.
367
368 Say Y here to create a kernel to run on the SGI 320 or 540.
369
370 A kernel compiled for the Visual Workstation will run on general
371 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
372
348config SCHED_NO_NO_OMIT_FRAME_POINTER 373config SCHED_NO_NO_OMIT_FRAME_POINTER
349 def_bool y 374 def_bool y
350 prompt "Single-depth WCHAN output" 375 prompt "Single-depth WCHAN output"
@@ -373,7 +398,7 @@ config VMI
373 bool "VMI Guest support" 398 bool "VMI Guest support"
374 select PARAVIRT 399 select PARAVIRT
375 depends on X86_32 400 depends on X86_32
376 depends on !(X86_VISWS || X86_VOYAGER) 401 depends on !X86_VOYAGER
377 help 402 help
378 VMI provides a paravirtualized interface to the VMware ESX server 403 VMI provides a paravirtualized interface to the VMware ESX server
379 (it could be used by other hypervisors in theory too, but is not 404 (it could be used by other hypervisors in theory too, but is not
@@ -384,7 +409,7 @@ config KVM_CLOCK
384 bool "KVM paravirtualized clock" 409 bool "KVM paravirtualized clock"
385 select PARAVIRT 410 select PARAVIRT
386 select PARAVIRT_CLOCK 411 select PARAVIRT_CLOCK
387 depends on !(X86_VISWS || X86_VOYAGER) 412 depends on !X86_VOYAGER
388 help 413 help
389 Turning on this option will allow you to run a paravirtualized clock 414 Turning on this option will allow you to run a paravirtualized clock
390 when running over the KVM hypervisor. Instead of relying on a PIT 415 when running over the KVM hypervisor. Instead of relying on a PIT
@@ -395,7 +420,7 @@ config KVM_CLOCK
395config KVM_GUEST 420config KVM_GUEST
396 bool "KVM Guest support" 421 bool "KVM Guest support"
397 select PARAVIRT 422 select PARAVIRT
398 depends on !(X86_VISWS || X86_VOYAGER) 423 depends on !X86_VOYAGER
399 help 424 help
400 This option enables various optimizations for running under the KVM 425 This option enables various optimizations for running under the KVM
401 hypervisor. 426 hypervisor.
@@ -404,7 +429,7 @@ source "arch/x86/lguest/Kconfig"
404 429
405config PARAVIRT 430config PARAVIRT
406 bool "Enable paravirtualization code" 431 bool "Enable paravirtualization code"
407 depends on !(X86_VISWS || X86_VOYAGER) 432 depends on !X86_VOYAGER
408 help 433 help
409 This changes the kernel so it can modify itself when it is run 434 This changes the kernel so it can modify itself when it is run
410 under a hypervisor, potentially improving performance significantly 435 under a hypervisor, potentially improving performance significantly
@@ -417,51 +442,31 @@ config PARAVIRT_CLOCK
417 442
418endif 443endif
419 444
420config MEMTEST_BOOTPARAM 445config PARAVIRT_DEBUG
421 bool "Memtest boot parameter" 446 bool "paravirt-ops debugging"
422 depends on X86_64 447 depends on PARAVIRT && DEBUG_KERNEL
423 default y 448 help
424 help 449 Enable to debug paravirt_ops internals. Specifically, BUG if
425 This option adds a kernel parameter 'memtest', which allows memtest 450 a paravirt_op is missing when it is called.
426 to be disabled at boot. If this option is selected, memtest
427 functionality can be disabled with memtest=0 on the kernel
428 command line. The purpose of this option is to allow a single
429 kernel image to be distributed with memtest built in, but not
430 necessarily enabled.
431
432 If you are unsure how to answer this question, answer Y.
433 451
434config MEMTEST_BOOTPARAM_VALUE 452config MEMTEST
435 int "Memtest boot parameter default value (0-4)" 453 bool "Memtest"
436 depends on MEMTEST_BOOTPARAM
437 range 0 4
438 default 0
439 help 454 help
440 This option sets the default value for the kernel parameter 455 This option adds a kernel parameter 'memtest', which allows memtest
441 'memtest', which allows memtest to be disabled at boot. If this 456 to be set.
442 option is set to 0 (zero), the memtest kernel parameter will 457 memtest=0, mean disabled; -- default
443 default to 0, disabling memtest at bootup. If this option is 458 memtest=1, mean do 1 test pattern;
444 set to 4, the memtest kernel parameter will default to 4, 459 ...
445 enabling memtest at bootup, and use that as pattern number. 460 memtest=4, mean do 4 test patterns.
446 461 If you are unsure how to answer this question, answer N.
447 If you are unsure how to answer this question, answer 0.
448
449config ACPI_SRAT
450 def_bool y
451 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
452 select ACPI_NUMA
453
454config HAVE_ARCH_PARSE_SRAT
455 def_bool y
456 depends on ACPI_SRAT
457 462
458config X86_SUMMIT_NUMA 463config X86_SUMMIT_NUMA
459 def_bool y 464 def_bool y
460 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) 465 depends on X86_32 && NUMA && X86_GENERICARCH
461 466
462config X86_CYCLONE_TIMER 467config X86_CYCLONE_TIMER
463 def_bool y 468 def_bool y
464 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH 469 depends on X86_GENERICARCH
465 470
466config ES7000_CLUSTERED_APIC 471config ES7000_CLUSTERED_APIC
467 def_bool y 472 def_bool y
@@ -549,6 +554,21 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
549 Calgary anyway, pass 'iommu=calgary' on the kernel command line. 554 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
550 If unsure, say Y. 555 If unsure, say Y.
551 556
557config AMD_IOMMU
558 bool "AMD IOMMU support"
559 select SWIOTLB
560 depends on X86_64 && PCI && ACPI
561 help
562 With this option you can enable support for AMD IOMMU hardware in
563 your system. An IOMMU is a hardware component which provides
564 remapping of DMA memory accesses from devices. With an AMD IOMMU you
565 can isolate the the DMA memory of different devices and protect the
566 system from misbehaving device drivers or hardware.
567
568 You can find out if your system has an AMD IOMMU if you look into
569 your BIOS for an option to enable it or if you have an IVRS ACPI
570 table.
571
552# need this always selected by IOMMU for the VIA workaround 572# need this always selected by IOMMU for the VIA workaround
553config SWIOTLB 573config SWIOTLB
554 bool 574 bool
@@ -560,21 +580,36 @@ config SWIOTLB
560 3 GB of memory. If unsure, say Y. 580 3 GB of memory. If unsure, say Y.
561 581
562config IOMMU_HELPER 582config IOMMU_HELPER
563 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) 583 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
584config MAXSMP
585 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
586 depends on X86_64 && SMP
587 default n
588 help
589 Configure maximum number of CPUS and NUMA Nodes for this architecture.
590 If unsure, say N.
564 591
592if MAXSMP
565config NR_CPUS 593config NR_CPUS
566 int "Maximum number of CPUs (2-255)" 594 int
567 range 2 255 595 default "4096"
596endif
597
598if !MAXSMP
599config NR_CPUS
600 int "Maximum number of CPUs (2-4096)"
601 range 2 4096
568 depends on SMP 602 depends on SMP
569 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 603 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
570 default "8" 604 default "8"
571 help 605 help
572 This allows you to specify the maximum number of CPUs which this 606 This allows you to specify the maximum number of CPUs which this
573 kernel will support. The maximum supported value is 255 and the 607 kernel will support. The maximum supported value is 4096 and the
574 minimum value which makes sense is 2. 608 minimum value which makes sense is 2.
575 609
576 This is purely to save memory - each supported CPU adds 610 This is purely to save memory - each supported CPU adds
577 approximately eight kilobytes to the kernel image. 611 approximately eight kilobytes to the kernel image.
612endif
578 613
579config SCHED_SMT 614config SCHED_SMT
580 bool "SMT (Hyperthreading) scheduler support" 615 bool "SMT (Hyperthreading) scheduler support"
@@ -598,7 +633,7 @@ source "kernel/Kconfig.preempt"
598 633
599config X86_UP_APIC 634config X86_UP_APIC
600 bool "Local APIC support on uniprocessors" 635 bool "Local APIC support on uniprocessors"
601 depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) 636 depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
602 help 637 help
603 A local APIC (Advanced Programmable Interrupt Controller) is an 638 A local APIC (Advanced Programmable Interrupt Controller) is an
604 integrated interrupt controller in the CPU. If you have a single-CPU 639 integrated interrupt controller in the CPU. If you have a single-CPU
@@ -623,11 +658,11 @@ config X86_UP_IOAPIC
623 658
624config X86_LOCAL_APIC 659config X86_LOCAL_APIC
625 def_bool y 660 def_bool y
626 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 661 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
627 662
628config X86_IO_APIC 663config X86_IO_APIC
629 def_bool y 664 def_bool y
630 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 665 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
631 666
632config X86_VISWS_APIC 667config X86_VISWS_APIC
633 def_bool y 668 def_bool y
@@ -681,7 +716,7 @@ config X86_MCE_NONFATAL
681 716
682config X86_MCE_P4THERMAL 717config X86_MCE_P4THERMAL
683 bool "check for P4 thermal throttling interrupt." 718 bool "check for P4 thermal throttling interrupt."
684 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS 719 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
685 help 720 help
686 Enabling this feature will cause a message to be printed when the P4 721 Enabling this feature will cause a message to be printed when the P4
687 enters thermal throttling. 722 enters thermal throttling.
@@ -911,9 +946,9 @@ config X86_PAE
911config NUMA 946config NUMA
912 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 947 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
913 depends on SMP 948 depends on SMP
914 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) 949 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
915 default n if X86_PC 950 default n if X86_PC
916 default y if (X86_NUMAQ || X86_SUMMIT) 951 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
917 help 952 help
918 Enable NUMA (Non Uniform Memory Access) support. 953 Enable NUMA (Non Uniform Memory Access) support.
919 The kernel will try to allocate memory used by a CPU on the 954 The kernel will try to allocate memory used by a CPU on the
@@ -965,13 +1000,25 @@ config NUMA_EMU
965 into virtual nodes when booted with "numa=fake=N", where N is the 1000 into virtual nodes when booted with "numa=fake=N", where N is the
966 number of nodes. This is only useful for debugging. 1001 number of nodes. This is only useful for debugging.
967 1002
1003if MAXSMP
1004
968config NODES_SHIFT 1005config NODES_SHIFT
969 int "Max num nodes shift(1-9)" 1006 int
970 range 1 9 if X86_64 1007 default "9"
1008endif
1009
1010if !MAXSMP
1011config NODES_SHIFT
1012 int "Maximum NUMA Nodes (as a power of 2)"
1013 range 1 9 if X86_64
971 default "6" if X86_64 1014 default "6" if X86_64
972 default "4" if X86_NUMAQ 1015 default "4" if X86_NUMAQ
973 default "3" 1016 default "3"
974 depends on NEED_MULTIPLE_NODES 1017 depends on NEED_MULTIPLE_NODES
1018 help
1019 Specify the maximum number of NUMA Nodes available on the target
1020 system. Increases memory reserved to accomodate various tables.
1021endif
975 1022
976config HAVE_ARCH_BOOTMEM_NODE 1023config HAVE_ARCH_BOOTMEM_NODE
977 def_bool y 1024 def_bool y
@@ -1090,6 +1137,37 @@ config MTRR
1090 1137
1091 See <file:Documentation/mtrr.txt> for more information. 1138 See <file:Documentation/mtrr.txt> for more information.
1092 1139
1140config MTRR_SANITIZER
1141 bool
1142 prompt "MTRR cleanup support"
1143 depends on MTRR
1144 help
1145 Convert MTRR layout from continuous to discrete, so X drivers can
1146 add writeback entries.
1147
1148 Can be disabled with disable_mtrr_cleanup on the kernel command line.
1149 The largest mtrr entry size for a continous block can be set with
1150 mtrr_chunk_size.
1151
1152 If unsure, say N.
1153
1154config MTRR_SANITIZER_ENABLE_DEFAULT
1155 int "MTRR cleanup enable value (0-1)"
1156 range 0 1
1157 default "0"
1158 depends on MTRR_SANITIZER
1159 help
1160 Enable mtrr cleanup default value
1161
1162config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1163 int "MTRR cleanup spare reg num (0-7)"
1164 range 0 7
1165 default "1"
1166 depends on MTRR_SANITIZER
1167 help
1168 mtrr cleanup spare entries default, it can be changed via
1169 mtrr_spare_reg_nr=N on the kernel command line.
1170
1093config X86_PAT 1171config X86_PAT
1094 bool 1172 bool
1095 prompt "x86 PAT support" 1173 prompt "x86 PAT support"
@@ -1190,7 +1268,6 @@ config KEXEC
1190 1268
1191config CRASH_DUMP 1269config CRASH_DUMP
1192 bool "kernel crash dumps (EXPERIMENTAL)" 1270 bool "kernel crash dumps (EXPERIMENTAL)"
1193 depends on EXPERIMENTAL
1194 depends on X86_64 || (X86_32 && HIGHMEM) 1271 depends on X86_64 || (X86_32 && HIGHMEM)
1195 help 1272 help
1196 Generate crash dump after being started by kexec. 1273 Generate crash dump after being started by kexec.
@@ -1203,6 +1280,14 @@ config CRASH_DUMP
1203 (CONFIG_RELOCATABLE=y). 1280 (CONFIG_RELOCATABLE=y).
1204 For more details see Documentation/kdump/kdump.txt 1281 For more details see Documentation/kdump/kdump.txt
1205 1282
1283config KEXEC_JUMP
1284 bool "kexec jump (EXPERIMENTAL)"
1285 depends on EXPERIMENTAL
1286 depends on KEXEC && HIBERNATION && X86_32
1287 help
1288 Jump between original kernel and kexeced kernel and invoke
1289 code in physical address mode via KEXEC
1290
1206config PHYSICAL_START 1291config PHYSICAL_START
1207 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1292 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1208 default "0x1000000" if X86_NUMAQ 1293 default "0x1000000" if X86_NUMAQ
@@ -1339,7 +1424,7 @@ config X86_APM_BOOT
1339 1424
1340menuconfig APM 1425menuconfig APM
1341 tristate "APM (Advanced Power Management) BIOS support" 1426 tristate "APM (Advanced Power Management) BIOS support"
1342 depends on X86_32 && PM_SLEEP && !X86_VISWS 1427 depends on X86_32 && PM_SLEEP
1343 ---help--- 1428 ---help---
1344 APM is a BIOS specification for saving power using several different 1429 APM is a BIOS specification for saving power using several different
1345 techniques. This is mostly useful for battery powered laptops with 1430 techniques. This is mostly useful for battery powered laptops with
@@ -1475,8 +1560,7 @@ endmenu
1475menu "Bus options (PCI etc.)" 1560menu "Bus options (PCI etc.)"
1476 1561
1477config PCI 1562config PCI
1478 bool "PCI support" if !X86_VISWS && !X86_VSMP 1563 bool "PCI support"
1479 depends on !X86_VOYAGER
1480 default y 1564 default y
1481 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1565 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1482 help 1566 help
@@ -1487,7 +1571,7 @@ config PCI
1487 1571
1488choice 1572choice
1489 prompt "PCI access mode" 1573 prompt "PCI access mode"
1490 depends on X86_32 && PCI && !X86_VISWS 1574 depends on X86_32 && PCI
1491 default PCI_GOANY 1575 default PCI_GOANY
1492 ---help--- 1576 ---help---
1493 On PCI systems, the BIOS can be used to detect the PCI devices and 1577 On PCI systems, the BIOS can be used to detect the PCI devices and
@@ -1524,12 +1608,12 @@ endchoice
1524 1608
1525config PCI_BIOS 1609config PCI_BIOS
1526 def_bool y 1610 def_bool y
1527 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1611 depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
1528 1612
1529# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1613# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1530config PCI_DIRECT 1614config PCI_DIRECT
1531 def_bool y 1615 def_bool y
1532 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC) || X86_VISWS) 1616 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
1533 1617
1534config PCI_MMCONFIG 1618config PCI_MMCONFIG
1535 def_bool y 1619 def_bool y
@@ -1589,7 +1673,7 @@ if X86_32
1589 1673
1590config ISA 1674config ISA
1591 bool "ISA support" 1675 bool "ISA support"
1592 depends on !(X86_VOYAGER || X86_VISWS) 1676 depends on !X86_VOYAGER
1593 help 1677 help
1594 Find out whether you have ISA slots on your motherboard. ISA is the 1678 Find out whether you have ISA slots on your motherboard. ISA is the
1595 name of a bus system, i.e. the way the CPU talks to the other stuff 1679 name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -1616,7 +1700,7 @@ config EISA
1616source "drivers/eisa/Kconfig" 1700source "drivers/eisa/Kconfig"
1617 1701
1618config MCA 1702config MCA
1619 bool "MCA support" if !(X86_VISWS || X86_VOYAGER) 1703 bool "MCA support" if !X86_VOYAGER
1620 default y if X86_VOYAGER 1704 default y if X86_VOYAGER
1621 help 1705 help
1622 MicroChannel Architecture is found in some IBM PS/2 machines and 1706 MicroChannel Architecture is found in some IBM PS/2 machines and
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ad6301849a1..2c518fbc52ec 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -344,7 +344,7 @@ config X86_F00F_BUG
344 344
345config X86_WP_WORKS_OK 345config X86_WP_WORKS_OK
346 def_bool y 346 def_bool y
347 depends on X86_32 && !M386 347 depends on !M386
348 348
349config X86_INVLPG 349config X86_INVLPG
350 def_bool y 350 def_bool y
@@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
362 def_bool y 362 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 364
365config X86_GOOD_APIC
366 def_bool y
367 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
368
369config X86_INTEL_USERCOPY 365config X86_INTEL_USERCOPY
370 def_bool y 366 def_bool y
371 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 367 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
@@ -399,6 +395,10 @@ config X86_TSC
399 def_bool y 395 def_bool y
400 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 396 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
401 397
398config X86_CMPXCHG64
399 def_bool y
400 depends on X86_PAE || X86_64
401
402# this should be set for all -march=.. options where the compiler 402# this should be set for all -march=.. options where the compiler
403# generates cmov. 403# generates cmov.
404config X86_CMOV 404config X86_CMOV
@@ -414,4 +414,4 @@ config X86_MINIMUM_CPU_FAMILY
414 414
415config X86_DEBUGCTLMSR 415config X86_DEBUGCTLMSR
416 def_bool y 416 def_bool y
417 depends on !(M586MMX || M586TSC || M586 || M486 || M386) 417 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 18363374d51a..092f019e033a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is disabled, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -20,6 +22,14 @@ config NONPROMISC_DEVMEM
20 22
21 If in doubt, say Y. 23 If in doubt, say Y.
22 24
25config X86_VERBOSE_BOOTUP
26 bool "Enable verbose x86 bootup info messages"
27 default y
28 help
29 Enables the informational output from the decompression stage
30 (e.g. bzImage) of the boot. If you disable this you will still
31 see errors. Disable this if you want silent bootup.
32
23config EARLY_PRINTK 33config EARLY_PRINTK
24 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EMBEDDED
25 default y 35 default y
@@ -60,7 +70,7 @@ config DEBUG_PAGEALLOC
60config DEBUG_PER_CPU_MAPS 70config DEBUG_PER_CPU_MAPS
61 bool "Debug access to per_cpu maps" 71 bool "Debug access to per_cpu maps"
62 depends on DEBUG_KERNEL 72 depends on DEBUG_KERNEL
63 depends on X86_64_SMP 73 depends on X86_SMP
64 default n 74 default n
65 help 75 help
66 Say Y to verify that the per_cpu map being accessed has 76 Say Y to verify that the per_cpu map being accessed has
@@ -129,15 +139,6 @@ config 4KSTACKS
129 on the VM subsystem for higher order allocations. This option 139 on the VM subsystem for higher order allocations. This option
130 will also use IRQ stacks to compensate for the reduced stackspace. 140 will also use IRQ stacks to compensate for the reduced stackspace.
131 141
132config X86_FIND_SMP_CONFIG
133 def_bool y
134 depends on X86_LOCAL_APIC || X86_VOYAGER
135 depends on X86_32
136
137config X86_MPPARSE
138 def_bool y
139 depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
140
141config DOUBLEFAULT 142config DOUBLEFAULT
142 default y 143 default y
143 bool "Enable doublefault exception handler" if EMBEDDED 144 bool "Enable doublefault exception handler" if EMBEDDED
@@ -172,6 +173,33 @@ config IOMMU_LEAK
172 Add a simple leak tracer to the IOMMU code. This is useful when you 173 Add a simple leak tracer to the IOMMU code. This is useful when you
173 are debugging a buggy device driver that leaks IOMMU mappings. 174 are debugging a buggy device driver that leaks IOMMU mappings.
174 175
176config MMIOTRACE_HOOKS
177 bool
178
179config MMIOTRACE
180 bool "Memory mapped IO tracing"
181 depends on DEBUG_KERNEL && PCI
182 select TRACING
183 select MMIOTRACE_HOOKS
184 help
185 Mmiotrace traces Memory Mapped I/O access and is meant for
186 debugging and reverse engineering. It is called from the ioremap
187 implementation and works via page faults. Tracing is disabled by
188 default and can be enabled at run-time.
189
190 See Documentation/tracers/mmiotrace.txt.
191 If you are not helping to develop drivers, say N.
192
193config MMIOTRACE_TEST
194 tristate "Test module for mmiotrace"
195 depends on MMIOTRACE && m
196 help
197 This is a dumb module for testing mmiotrace. It is very dangerous
198 as it will write garbage to IO memory starting at a given address.
199 However, it should be safe to use on e.g. unused portion of VRAM.
200
201 Say N, unless you absolutely know what you are doing.
202
175# 203#
176# IO delay types: 204# IO delay types:
177# 205#
@@ -261,7 +289,6 @@ config CPA_DEBUG
261 289
262config OPTIMIZE_INLINING 290config OPTIMIZE_INLINING
263 bool "Allow gcc to uninline functions marked 'inline'" 291 bool "Allow gcc to uninline functions marked 'inline'"
264 depends on BROKEN
265 help 292 help
266 This option determines if the kernel forces gcc to inline the functions 293 This option determines if the kernel forces gcc to inline the functions
267 developers have marked 'inline'. Doing so takes away freedom from gcc to 294 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -272,5 +299,7 @@ config OPTIMIZE_INLINING
272 become the default in the future, until then this option is there to 299 become the default in the future, until then this option is there to
273 test gcc for this. 300 test gcc for this.
274 301
302 If unsure, say N.
303
275endmenu 304endmenu
276 305
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3cff3c894cf3..919ce21ea654 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -113,33 +113,11 @@ mcore-y := arch/x86/mach-default/
113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager 113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ 114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115 115
116# VISWS subarch support
117mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
118mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/
119
120# NUMAQ subarch support
121mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
122mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/
123
124# BIGSMP subarch support
125mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
126mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/
127
128#Summit subarch support
129mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
130mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/
131
132# generic subarchitecture 116# generic subarchitecture
133mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic 117mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
134fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
135mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
136 120
137
138# ES7000 subarch support
139mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
140fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
141mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/
142
143# RDC R-321x subarch support 121# RDC R-321x subarch support
144mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x 122mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
145mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/ 123mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
@@ -160,6 +138,7 @@ KBUILD_AFLAGS += $(mflags-y)
160 138
161head-y := arch/x86/kernel/head_$(BITS).o 139head-y := arch/x86/kernel/head_$(BITS).o
162head-y += arch/x86/kernel/head$(BITS).o 140head-y += arch/x86/kernel/head$(BITS).o
141head-y += arch/x86/kernel/head.o
163head-y += arch/x86/kernel/init_task.o 142head-y += arch/x86/kernel/init_task.o
164 143
165libs-y += arch/x86/lib/ 144libs-y += arch/x86/lib/
@@ -210,12 +189,12 @@ all: bzImage
210 189
211# KBUILD_IMAGE specify target image being built 190# KBUILD_IMAGE specify target image being built
212 KBUILD_IMAGE := $(boot)/bzImage 191 KBUILD_IMAGE := $(boot)/bzImage
213zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage 192zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
214 193
215zImage bzImage: vmlinux 194zImage bzImage: vmlinux
216 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 195 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
217 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 196 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
218 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage 197 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
219 198
220compressed: zImage 199compressed: zImage
221 200
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index e01aafd03bde..4063d630deff 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -1,7 +1,7 @@
1/* -*- linux-c -*- ------------------------------------------------------- * 1/* -*- linux-c -*- ------------------------------------------------------- *
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * 5 *
6 * This file is part of the Linux kernel, and is made available under 6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 7 * the terms of the GNU General Public License version 2.
@@ -95,6 +95,9 @@ static void enable_a20_kbc(void)
95 95
96 outb(0xdf, 0x60); /* A20 on */ 96 outb(0xdf, 0x60); /* A20 on */
97 empty_8042(); 97 empty_8042();
98
99 outb(0xff, 0x64); /* Null command, but UHCI wants it */
100 empty_8042();
98} 101}
99 102
100static void enable_a20_fast(void) 103static void enable_a20_fast(void)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d8819efac81d..1d5dff4123e1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -30,6 +30,7 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/boot.h> 31#include <asm/boot.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33#include <asm/processor-flags.h>
33#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
34 35
35.section ".text.head" 36.section ".text.head"
@@ -109,7 +110,7 @@ startup_32:
109 110
110 /* Enable PAE mode */ 111 /* Enable PAE mode */
111 xorl %eax, %eax 112 xorl %eax, %eax
112 orl $(1 << 5), %eax 113 orl $(X86_CR4_PAE), %eax
113 movl %eax, %cr4 114 movl %eax, %cr4
114 115
115 /* 116 /*
@@ -170,7 +171,7 @@ startup_32:
170 pushl %eax 171 pushl %eax
171 172
172 /* Enter paged protected Mode, activating Long Mode */ 173 /* Enter paged protected Mode, activating Long Mode */
173 movl $0x80000001, %eax /* Enable Paging and Protected mode */ 174 movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
174 movl %eax, %cr0 175 movl %eax, %cr0
175 176
176 /* Jump from 32bit compatibility mode into 64bit mode. */ 177 /* Jump from 32bit compatibility mode into 64bit mode. */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 90456cee47c3..9fea73706479 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -30,6 +30,7 @@
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h>
33 34
34/* WARNING!! 35/* WARNING!!
35 * This code is compiled with -fPIC and it is relocated dynamically 36 * This code is compiled with -fPIC and it is relocated dynamically
@@ -181,32 +182,23 @@ static unsigned outcnt;
181static int fill_inbuf(void); 182static int fill_inbuf(void);
182static void flush_window(void); 183static void flush_window(void);
183static void error(char *m); 184static void error(char *m);
184static void gzip_mark(void **);
185static void gzip_release(void **);
186 185
187/* 186/*
188 * This is set up by the setup-routine at boot-time 187 * This is set up by the setup-routine at boot-time
189 */ 188 */
190static unsigned char *real_mode; /* Pointer to real-mode data */ 189static struct boot_params *real_mode; /* Pointer to real-mode data */
191 190static int quiet;
192#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
193#ifndef STANDARD_MEMORY_BIOS_CALL
194#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
195#endif
196#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
197 191
198extern unsigned char input_data[]; 192extern unsigned char input_data[];
199extern int input_len; 193extern int input_len;
200 194
201static long bytes_out; 195static long bytes_out;
202 196
203static void *malloc(int size);
204static void free(void *where);
205
206static void *memset(void *s, int c, unsigned n); 197static void *memset(void *s, int c, unsigned n);
207static void *memcpy(void *dest, const void *src, unsigned n); 198static void *memcpy(void *dest, const void *src, unsigned n);
208 199
209static void putstr(const char *); 200static void __putstr(int, const char *);
201#define putstr(__x) __putstr(0, __x)
210 202
211#ifdef CONFIG_X86_64 203#ifdef CONFIG_X86_64
212#define memptr long 204#define memptr long
@@ -221,46 +213,8 @@ static char *vidmem;
221static int vidport; 213static int vidport;
222static int lines, cols; 214static int lines, cols;
223 215
224#ifdef CONFIG_X86_NUMAQ
225void *xquad_portio;
226#endif
227
228#include "../../../../lib/inflate.c" 216#include "../../../../lib/inflate.c"
229 217
230static void *malloc(int size)
231{
232 void *p;
233
234 if (size < 0)
235 error("Malloc error");
236 if (free_mem_ptr <= 0)
237 error("Memory error");
238
239 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
240
241 p = (void *)free_mem_ptr;
242 free_mem_ptr += size;
243
244 if (free_mem_ptr >= free_mem_end_ptr)
245 error("Out of memory");
246
247 return p;
248}
249
250static void free(void *where)
251{ /* Don't care */
252}
253
254static void gzip_mark(void **ptr)
255{
256 *ptr = (void *) free_mem_ptr;
257}
258
259static void gzip_release(void **ptr)
260{
261 free_mem_ptr = (memptr) *ptr;
262}
263
264static void scroll(void) 218static void scroll(void)
265{ 219{
266 int i; 220 int i;
@@ -270,18 +224,24 @@ static void scroll(void)
270 vidmem[i] = ' '; 224 vidmem[i] = ' ';
271} 225}
272 226
273static void putstr(const char *s) 227static void __putstr(int error, const char *s)
274{ 228{
275 int x, y, pos; 229 int x, y, pos;
276 char c; 230 char c;
277 231
232#ifndef CONFIG_X86_VERBOSE_BOOTUP
233 if (!error)
234 return;
235#endif
236
278#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
279 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) 238 if (real_mode->screen_info.orig_video_mode == 0 &&
239 lines == 0 && cols == 0)
280 return; 240 return;
281#endif 241#endif
282 242
283 x = RM_SCREEN_INFO.orig_x; 243 x = real_mode->screen_info.orig_x;
284 y = RM_SCREEN_INFO.orig_y; 244 y = real_mode->screen_info.orig_y;
285 245
286 while ((c = *s++) != '\0') { 246 while ((c = *s++) != '\0') {
287 if (c == '\n') { 247 if (c == '\n') {
@@ -302,8 +262,8 @@ static void putstr(const char *s)
302 } 262 }
303 } 263 }
304 264
305 RM_SCREEN_INFO.orig_x = x; 265 real_mode->screen_info.orig_x = x;
306 RM_SCREEN_INFO.orig_y = y; 266 real_mode->screen_info.orig_y = y;
307 267
308 pos = (x + cols * y) * 2; /* Update cursor position */ 268 pos = (x + cols * y) * 2; /* Update cursor position */
309 outb(14, vidport); 269 outb(14, vidport);
@@ -366,9 +326,9 @@ static void flush_window(void)
366 326
367static void error(char *x) 327static void error(char *x)
368{ 328{
369 putstr("\n\n"); 329 __putstr(1, "\n\n");
370 putstr(x); 330 __putstr(1, x);
371 putstr("\n\n -- System halted"); 331 __putstr(1, "\n\n -- System halted");
372 332
373 while (1) 333 while (1)
374 asm("hlt"); 334 asm("hlt");
@@ -395,7 +355,8 @@ static void parse_elf(void *output)
395 return; 355 return;
396 } 356 }
397 357
398 putstr("Parsing ELF... "); 358 if (!quiet)
359 putstr("Parsing ELF... ");
399 360
400 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); 361 phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
401 if (!phdrs) 362 if (!phdrs)
@@ -430,7 +391,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
430{ 391{
431 real_mode = rmode; 392 real_mode = rmode;
432 393
433 if (RM_SCREEN_INFO.orig_video_mode == 7) { 394 if (real_mode->hdr.loadflags & QUIET_FLAG)
395 quiet = 1;
396
397 if (real_mode->screen_info.orig_video_mode == 7) {
434 vidmem = (char *) 0xb0000; 398 vidmem = (char *) 0xb0000;
435 vidport = 0x3b4; 399 vidport = 0x3b4;
436 } else { 400 } else {
@@ -438,8 +402,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
438 vidport = 0x3d4; 402 vidport = 0x3d4;
439 } 403 }
440 404
441 lines = RM_SCREEN_INFO.orig_video_lines; 405 lines = real_mode->screen_info.orig_video_lines;
442 cols = RM_SCREEN_INFO.orig_video_cols; 406 cols = real_mode->screen_info.orig_video_cols;
443 407
444 window = output; /* Output buffer (Normally at 1M) */ 408 window = output; /* Output buffer (Normally at 1M) */
445 free_mem_ptr = heap; /* Heap */ 409 free_mem_ptr = heap; /* Heap */
@@ -465,9 +429,11 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
465#endif 429#endif
466 430
467 makecrc(); 431 makecrc();
468 putstr("\nDecompressing Linux... "); 432 if (!quiet)
433 putstr("\nDecompressing Linux... ");
469 gunzip(); 434 gunzip();
470 parse_elf(output); 435 parse_elf(output);
471 putstr("done.\nBooting the kernel.\n"); 436 if (!quiet)
437 putstr("done.\nBooting the kernel.\n");
472 return; 438 return;
473} 439}
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index edaadea90aaf..a1310c52fc0c 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -10,16 +10,20 @@
10#define USE_BSD 10#define USE_BSD
11#include <endian.h> 11#include <endian.h>
12 12
13#define MAX_SHDRS 100
14#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 13#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
15static Elf32_Ehdr ehdr; 14static Elf32_Ehdr ehdr;
16static Elf32_Shdr shdr[MAX_SHDRS];
17static Elf32_Sym *symtab[MAX_SHDRS];
18static Elf32_Rel *reltab[MAX_SHDRS];
19static char *strtab[MAX_SHDRS];
20static unsigned long reloc_count, reloc_idx; 15static unsigned long reloc_count, reloc_idx;
21static unsigned long *relocs; 16static unsigned long *relocs;
22 17
18struct section {
19 Elf32_Shdr shdr;
20 struct section *link;
21 Elf32_Sym *symtab;
22 Elf32_Rel *reltab;
23 char *strtab;
24};
25static struct section *secs;
26
23/* 27/*
24 * Following symbols have been audited. There values are constant and do 28 * Following symbols have been audited. There values are constant and do
25 * not change if bzImage is loaded at a different physical address than 29 * not change if bzImage is loaded at a different physical address than
@@ -35,7 +39,7 @@ static int is_safe_abs_reloc(const char* sym_name)
35{ 39{
36 int i; 40 int i;
37 41
38 for(i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { 42 for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) {
39 if (!strcmp(sym_name, safe_abs_relocs[i])) 43 if (!strcmp(sym_name, safe_abs_relocs[i]))
40 /* Match found */ 44 /* Match found */
41 return 1; 45 return 1;
@@ -137,10 +141,10 @@ static const char *sec_name(unsigned shndx)
137{ 141{
138 const char *sec_strtab; 142 const char *sec_strtab;
139 const char *name; 143 const char *name;
140 sec_strtab = strtab[ehdr.e_shstrndx]; 144 sec_strtab = secs[ehdr.e_shstrndx].strtab;
141 name = "<noname>"; 145 name = "<noname>";
142 if (shndx < ehdr.e_shnum) { 146 if (shndx < ehdr.e_shnum) {
143 name = sec_strtab + shdr[shndx].sh_name; 147 name = sec_strtab + secs[shndx].shdr.sh_name;
144 } 148 }
145 else if (shndx == SHN_ABS) { 149 else if (shndx == SHN_ABS) {
146 name = "ABSOLUTE"; 150 name = "ABSOLUTE";
@@ -159,7 +163,7 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
159 name = sym_strtab + sym->st_name; 163 name = sym_strtab + sym->st_name;
160 } 164 }
161 else { 165 else {
162 name = sec_name(shdr[sym->st_shndx].sh_name); 166 name = sec_name(secs[sym->st_shndx].shdr.sh_name);
163 } 167 }
164 return name; 168 return name;
165} 169}
@@ -244,29 +248,34 @@ static void read_ehdr(FILE *fp)
244static void read_shdrs(FILE *fp) 248static void read_shdrs(FILE *fp)
245{ 249{
246 int i; 250 int i;
247 if (ehdr.e_shnum > MAX_SHDRS) { 251 Elf32_Shdr shdr;
248 die("%d section headers supported: %d\n", 252
249 ehdr.e_shnum, MAX_SHDRS); 253 secs = calloc(ehdr.e_shnum, sizeof(struct section));
254 if (!secs) {
255 die("Unable to allocate %d section headers\n",
256 ehdr.e_shnum);
250 } 257 }
251 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { 258 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
252 die("Seek to %d failed: %s\n", 259 die("Seek to %d failed: %s\n",
253 ehdr.e_shoff, strerror(errno)); 260 ehdr.e_shoff, strerror(errno));
254 } 261 }
255 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) { 262 for (i = 0; i < ehdr.e_shnum; i++) {
256 die("Cannot read ELF section headers: %s\n", 263 struct section *sec = &secs[i];
257 strerror(errno)); 264 if (fread(&shdr, sizeof shdr, 1, fp) != 1)
258 } 265 die("Cannot read ELF section headers %d/%d: %s\n",
259 for(i = 0; i < ehdr.e_shnum; i++) { 266 i, ehdr.e_shnum, strerror(errno));
260 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name); 267 sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name);
261 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type); 268 sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type);
262 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags); 269 sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags);
263 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr); 270 sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr);
264 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset); 271 sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset);
265 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size); 272 sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size);
266 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link); 273 sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link);
267 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info); 274 sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info);
268 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign); 275 sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
269 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize); 276 sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize);
277 if (sec->shdr.sh_link < ehdr.e_shnum)
278 sec->link = &secs[sec->shdr.sh_link];
270 } 279 }
271 280
272} 281}
@@ -274,20 +283,22 @@ static void read_shdrs(FILE *fp)
274static void read_strtabs(FILE *fp) 283static void read_strtabs(FILE *fp)
275{ 284{
276 int i; 285 int i;
277 for(i = 0; i < ehdr.e_shnum; i++) { 286 for (i = 0; i < ehdr.e_shnum; i++) {
278 if (shdr[i].sh_type != SHT_STRTAB) { 287 struct section *sec = &secs[i];
288 if (sec->shdr.sh_type != SHT_STRTAB) {
279 continue; 289 continue;
280 } 290 }
281 strtab[i] = malloc(shdr[i].sh_size); 291 sec->strtab = malloc(sec->shdr.sh_size);
282 if (!strtab[i]) { 292 if (!sec->strtab) {
283 die("malloc of %d bytes for strtab failed\n", 293 die("malloc of %d bytes for strtab failed\n",
284 shdr[i].sh_size); 294 sec->shdr.sh_size);
285 } 295 }
286 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 296 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
287 die("Seek to %d failed: %s\n", 297 die("Seek to %d failed: %s\n",
288 shdr[i].sh_offset, strerror(errno)); 298 sec->shdr.sh_offset, strerror(errno));
289 } 299 }
290 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 300 if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
301 != sec->shdr.sh_size) {
291 die("Cannot read symbol table: %s\n", 302 die("Cannot read symbol table: %s\n",
292 strerror(errno)); 303 strerror(errno));
293 } 304 }
@@ -297,28 +308,31 @@ static void read_strtabs(FILE *fp)
297static void read_symtabs(FILE *fp) 308static void read_symtabs(FILE *fp)
298{ 309{
299 int i,j; 310 int i,j;
300 for(i = 0; i < ehdr.e_shnum; i++) { 311 for (i = 0; i < ehdr.e_shnum; i++) {
301 if (shdr[i].sh_type != SHT_SYMTAB) { 312 struct section *sec = &secs[i];
313 if (sec->shdr.sh_type != SHT_SYMTAB) {
302 continue; 314 continue;
303 } 315 }
304 symtab[i] = malloc(shdr[i].sh_size); 316 sec->symtab = malloc(sec->shdr.sh_size);
305 if (!symtab[i]) { 317 if (!sec->symtab) {
306 die("malloc of %d bytes for symtab failed\n", 318 die("malloc of %d bytes for symtab failed\n",
307 shdr[i].sh_size); 319 sec->shdr.sh_size);
308 } 320 }
309 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 321 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
310 die("Seek to %d failed: %s\n", 322 die("Seek to %d failed: %s\n",
311 shdr[i].sh_offset, strerror(errno)); 323 sec->shdr.sh_offset, strerror(errno));
312 } 324 }
313 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 325 if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
326 != sec->shdr.sh_size) {
314 die("Cannot read symbol table: %s\n", 327 die("Cannot read symbol table: %s\n",
315 strerror(errno)); 328 strerror(errno));
316 } 329 }
317 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) { 330 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
318 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name); 331 Elf32_Sym *sym = &sec->symtab[j];
319 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value); 332 sym->st_name = elf32_to_cpu(sym->st_name);
320 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size); 333 sym->st_value = elf32_to_cpu(sym->st_value);
321 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx); 334 sym->st_size = elf32_to_cpu(sym->st_size);
335 sym->st_shndx = elf16_to_cpu(sym->st_shndx);
322 } 336 }
323 } 337 }
324} 338}
@@ -327,26 +341,29 @@ static void read_symtabs(FILE *fp)
327static void read_relocs(FILE *fp) 341static void read_relocs(FILE *fp)
328{ 342{
329 int i,j; 343 int i,j;
330 for(i = 0; i < ehdr.e_shnum; i++) { 344 for (i = 0; i < ehdr.e_shnum; i++) {
331 if (shdr[i].sh_type != SHT_REL) { 345 struct section *sec = &secs[i];
346 if (sec->shdr.sh_type != SHT_REL) {
332 continue; 347 continue;
333 } 348 }
334 reltab[i] = malloc(shdr[i].sh_size); 349 sec->reltab = malloc(sec->shdr.sh_size);
335 if (!reltab[i]) { 350 if (!sec->reltab) {
336 die("malloc of %d bytes for relocs failed\n", 351 die("malloc of %d bytes for relocs failed\n",
337 shdr[i].sh_size); 352 sec->shdr.sh_size);
338 } 353 }
339 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { 354 if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
340 die("Seek to %d failed: %s\n", 355 die("Seek to %d failed: %s\n",
341 shdr[i].sh_offset, strerror(errno)); 356 sec->shdr.sh_offset, strerror(errno));
342 } 357 }
343 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { 358 if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
359 != sec->shdr.sh_size) {
344 die("Cannot read symbol table: %s\n", 360 die("Cannot read symbol table: %s\n",
345 strerror(errno)); 361 strerror(errno));
346 } 362 }
347 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 363 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
348 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset); 364 Elf32_Rel *rel = &sec->reltab[j];
349 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info); 365 rel->r_offset = elf32_to_cpu(rel->r_offset);
366 rel->r_info = elf32_to_cpu(rel->r_info);
350 } 367 }
351 } 368 }
352} 369}
@@ -357,19 +374,21 @@ static void print_absolute_symbols(void)
357 int i; 374 int i;
358 printf("Absolute symbols\n"); 375 printf("Absolute symbols\n");
359 printf(" Num: Value Size Type Bind Visibility Name\n"); 376 printf(" Num: Value Size Type Bind Visibility Name\n");
360 for(i = 0; i < ehdr.e_shnum; i++) { 377 for (i = 0; i < ehdr.e_shnum; i++) {
378 struct section *sec = &secs[i];
361 char *sym_strtab; 379 char *sym_strtab;
362 Elf32_Sym *sh_symtab; 380 Elf32_Sym *sh_symtab;
363 int j; 381 int j;
364 if (shdr[i].sh_type != SHT_SYMTAB) { 382
383 if (sec->shdr.sh_type != SHT_SYMTAB) {
365 continue; 384 continue;
366 } 385 }
367 sh_symtab = symtab[i]; 386 sh_symtab = sec->symtab;
368 sym_strtab = strtab[shdr[i].sh_link]; 387 sym_strtab = sec->link->strtab;
369 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) { 388 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
370 Elf32_Sym *sym; 389 Elf32_Sym *sym;
371 const char *name; 390 const char *name;
372 sym = &symtab[i][j]; 391 sym = &sec->symtab[j];
373 name = sym_name(sym_strtab, sym); 392 name = sym_name(sym_strtab, sym);
374 if (sym->st_shndx != SHN_ABS) { 393 if (sym->st_shndx != SHN_ABS) {
375 continue; 394 continue;
@@ -389,26 +408,27 @@ static void print_absolute_relocs(void)
389{ 408{
390 int i, printed = 0; 409 int i, printed = 0;
391 410
392 for(i = 0; i < ehdr.e_shnum; i++) { 411 for (i = 0; i < ehdr.e_shnum; i++) {
412 struct section *sec = &secs[i];
413 struct section *sec_applies, *sec_symtab;
393 char *sym_strtab; 414 char *sym_strtab;
394 Elf32_Sym *sh_symtab; 415 Elf32_Sym *sh_symtab;
395 unsigned sec_applies, sec_symtab;
396 int j; 416 int j;
397 if (shdr[i].sh_type != SHT_REL) { 417 if (sec->shdr.sh_type != SHT_REL) {
398 continue; 418 continue;
399 } 419 }
400 sec_symtab = shdr[i].sh_link; 420 sec_symtab = sec->link;
401 sec_applies = shdr[i].sh_info; 421 sec_applies = &secs[sec->shdr.sh_info];
402 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { 422 if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
403 continue; 423 continue;
404 } 424 }
405 sh_symtab = symtab[sec_symtab]; 425 sh_symtab = sec_symtab->symtab;
406 sym_strtab = strtab[shdr[sec_symtab].sh_link]; 426 sym_strtab = sec_symtab->link->strtab;
407 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 427 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
408 Elf32_Rel *rel; 428 Elf32_Rel *rel;
409 Elf32_Sym *sym; 429 Elf32_Sym *sym;
410 const char *name; 430 const char *name;
411 rel = &reltab[i][j]; 431 rel = &sec->reltab[j];
412 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 432 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
413 name = sym_name(sym_strtab, sym); 433 name = sym_name(sym_strtab, sym);
414 if (sym->st_shndx != SHN_ABS) { 434 if (sym->st_shndx != SHN_ABS) {
@@ -456,26 +476,28 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
456{ 476{
457 int i; 477 int i;
458 /* Walk through the relocations */ 478 /* Walk through the relocations */
459 for(i = 0; i < ehdr.e_shnum; i++) { 479 for (i = 0; i < ehdr.e_shnum; i++) {
460 char *sym_strtab; 480 char *sym_strtab;
461 Elf32_Sym *sh_symtab; 481 Elf32_Sym *sh_symtab;
462 unsigned sec_applies, sec_symtab; 482 struct section *sec_applies, *sec_symtab;
463 int j; 483 int j;
464 if (shdr[i].sh_type != SHT_REL) { 484 struct section *sec = &secs[i];
485
486 if (sec->shdr.sh_type != SHT_REL) {
465 continue; 487 continue;
466 } 488 }
467 sec_symtab = shdr[i].sh_link; 489 sec_symtab = sec->link;
468 sec_applies = shdr[i].sh_info; 490 sec_applies = &secs[sec->shdr.sh_info];
469 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { 491 if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
470 continue; 492 continue;
471 } 493 }
472 sh_symtab = symtab[sec_symtab]; 494 sh_symtab = sec_symtab->symtab;
473 sym_strtab = strtab[shdr[sec_symtab].sh_link]; 495 sym_strtab = sec->link->strtab;
474 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
475 Elf32_Rel *rel; 497 Elf32_Rel *rel;
476 Elf32_Sym *sym; 498 Elf32_Sym *sym;
477 unsigned r_type; 499 unsigned r_type;
478 rel = &reltab[i][j]; 500 rel = &sec->reltab[j];
479 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 501 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
480 r_type = ELF32_R_TYPE(rel->r_info); 502 r_type = ELF32_R_TYPE(rel->r_info);
481 /* Don't visit relocations to absolute symbols */ 503 /* Don't visit relocations to absolute symbols */
@@ -539,7 +561,7 @@ static void emit_relocs(int as_text)
539 */ 561 */
540 printf(".section \".data.reloc\",\"a\"\n"); 562 printf(".section \".data.reloc\",\"a\"\n");
541 printf(".balign 4\n"); 563 printf(".balign 4\n");
542 for(i = 0; i < reloc_count; i++) { 564 for (i = 0; i < reloc_count; i++) {
543 printf("\t .long 0x%08lx\n", relocs[i]); 565 printf("\t .long 0x%08lx\n", relocs[i]);
544 } 566 }
545 printf("\n"); 567 printf("\n");
@@ -550,7 +572,7 @@ static void emit_relocs(int as_text)
550 /* Print a stop */ 572 /* Print a stop */
551 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 573 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
552 /* Now print each relocation */ 574 /* Now print each relocation */
553 for(i = 0; i < reloc_count; i++) { 575 for (i = 0; i < reloc_count; i++) {
554 buf[0] = (relocs[i] >> 0) & 0xff; 576 buf[0] = (relocs[i] >> 0) & 0xff;
555 buf[1] = (relocs[i] >> 8) & 0xff; 577 buf[1] = (relocs[i] >> 8) & 0xff;
556 buf[2] = (relocs[i] >> 16) & 0xff; 578 buf[2] = (relocs[i] >> 16) & 0xff;
@@ -577,7 +599,7 @@ int main(int argc, char **argv)
577 show_absolute_relocs = 0; 599 show_absolute_relocs = 0;
578 as_text = 0; 600 as_text = 0;
579 fname = NULL; 601 fname = NULL;
580 for(i = 1; i < argc; i++) { 602 for (i = 1; i < argc; i++) {
581 char *arg = argv[i]; 603 char *arg = argv[i];
582 if (*arg == '-') { 604 if (*arg == '-') {
583 if (strcmp(argv[1], "--abs-syms") == 0) { 605 if (strcmp(argv[1], "--abs-syms") == 0) {
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 00e19edd852c..92d6fd73dc7d 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -28,6 +28,8 @@ static char *cpu_name(int level)
28 if (level == 64) { 28 if (level == 64) {
29 return "x86-64"; 29 return "x86-64";
30 } else { 30 } else {
31 if (level == 15)
32 level = 6;
31 sprintf(buf, "i%d86", level); 33 sprintf(buf, "i%d86", level);
32 return buf; 34 return buf;
33 } 35 }
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013b..d93cbc6464d0 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -167,9 +167,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 167 * Scan the BIOS-supported hard disks and query EDD
168 * information... 168 * information...
169 */ 169 */
170 get_edd_info(devno, &ei); 170 if (!get_edd_info(devno, &ei)
171 171 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 172 memcpy(edp, &ei, sizeof ei);
174 edp++; 173 edp++;
175 boot_params.eddbuf_entries++; 174 boot_params.eddbuf_entries++;
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 77569a4a3be1..2296164b54d2 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -165,6 +165,10 @@ void main(void)
165 /* Set the video mode */ 165 /* Set the video mode */
166 set_video(); 166 set_video();
167 167
168 /* Parse command line for 'quiet' and pass it to decompressor. */
169 if (cmdline_find_option_bool("quiet"))
170 boot_params.hdr.loadflags |= QUIET_FLAG;
171
168 /* Do the last things and invoke protected mode */ 172 /* Do the last things and invoke protected mode */
169 go_to_protected_mode(); 173 go_to_protected_mode();
170} 174}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index acad32eb4290..53165c97336b 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include "boot.h" 15#include "boot.h"
16#include <linux/kernel.h>
16 17
17#define SMAP 0x534d4150 /* ASCII "SMAP" */ 18#define SMAP 0x534d4150 /* ASCII "SMAP" */
18 19
@@ -53,7 +54,7 @@ static int detect_memory_e820(void)
53 54
54 count++; 55 count++;
55 desc++; 56 desc++;
56 } while (next && count < E820MAX); 57 } while (next && count < ARRAY_SIZE(boot_params.e820_map));
57 58
58 return boot_params.e820_entries = count; 59 return boot_params.e820_entries = count;
59} 60}
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59e..85a1cd8a8ff8 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
98/* 98/*
99 * Set up the GDT 99 * Set up the GDT
100 */ 100 */
101#define GDT_ENTRY(flags, base, limit) \
102 (((u64)(base & 0xff000000) << 32) | \
103 ((u64)flags << 40) | \
104 ((u64)(limit & 0x00ff0000) << 32) | \
105 ((u64)(base & 0x00ffffff) << 16) | \
106 ((u64)(limit & 0x0000ffff)))
107 101
108struct gdt_ptr { 102struct gdt_ptr {
109 u16 len; 103 u16 len;
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index ab049d40a884..141b6e20ed31 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -33,6 +33,8 @@ protected_mode_jump:
33 movw %cs, %bx 33 movw %cs, %bx
34 shll $4, %ebx 34 shll $4, %ebx
35 addl %ebx, 2f 35 addl %ebx, 2f
36 jmp 1f # Short jump to serialize on 386/486
371:
36 38
37 movw $__BOOT_DS, %cx 39 movw $__BOOT_DS, %cx
38 movw $__BOOT_TSS, %di 40 movw $__BOOT_TSS, %di
@@ -40,8 +42,6 @@ protected_mode_jump:
40 movl %cr0, %edx 42 movl %cr0, %edx
41 orb $X86_CR0_PE, %dl # Protected mode 43 orb $X86_CR0_PE, %dl # Protected mode
42 movl %edx, %cr0 44 movl %edx, %cr0
43 jmp 1f # Short jump to serialize on 386/486
441:
45 45
46 # Transition to 32-bit mode 46 # Transition to 32-bit mode
47 .byte 0x66, 0xea # ljmpl opcode 47 .byte 0x66, 0xea # ljmpl opcode
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 40ecb8d7688c..b939cb476dec 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -259,8 +259,7 @@ static int vga_probe(void)
259 return mode_count[adapter]; 259 return mode_count[adapter];
260} 260}
261 261
262__videocard video_vga = 262__videocard video_vga = {
263{
264 .card_name = "VGA", 263 .card_name = "VGA",
265 .probe = vga_probe, 264 .probe = vga_probe,
266 .set_mode = vga_set_mode, 265 .set_mode = vga_set_mode,
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ad7ddaaff588..4d73f53287b6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,54 +1,103 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.22-git14 3# Linux kernel version: 2.6.26-rc1
4# Fri Jul 20 09:53:15 2007 4# Sun May 4 19:59:02 2008
5# 5#
6# CONFIG_64BIT is not set
6CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set
9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
7CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y
8CONFIG_CLOCKSOURCE_WATCHDOG=y 14CONFIG_CLOCKSOURCE_WATCHDOG=y
9CONFIG_GENERIC_CLOCKEVENTS=y 15CONFIG_GENERIC_CLOCKEVENTS=y
10CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y 16CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
11CONFIG_LOCKDEP_SUPPORT=y 17CONFIG_LOCKDEP_SUPPORT=y
12CONFIG_STACKTRACE_SUPPORT=y 18CONFIG_STACKTRACE_SUPPORT=y
13CONFIG_SEMAPHORE_SLEEPERS=y 19CONFIG_HAVE_LATENCYTOP_SUPPORT=y
14CONFIG_X86=y 20CONFIG_FAST_CMPXCHG_LOCAL=y
15CONFIG_MMU=y 21CONFIG_MMU=y
16CONFIG_ZONE_DMA=y 22CONFIG_ZONE_DMA=y
17CONFIG_QUICKLIST=y
18CONFIG_GENERIC_ISA_DMA=y 23CONFIG_GENERIC_ISA_DMA=y
19CONFIG_GENERIC_IOMAP=y 24CONFIG_GENERIC_IOMAP=y
20CONFIG_GENERIC_BUG=y 25CONFIG_GENERIC_BUG=y
21CONFIG_GENERIC_HWEIGHT=y 26CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
22CONFIG_ARCH_MAY_HAVE_PC_FDC=y 28CONFIG_ARCH_MAY_HAVE_PC_FDC=y
23CONFIG_DMI=y 29# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
24CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 30CONFIG_RWSEM_XCHGADD_ALGORITHM=y
31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y
35# CONFIG_GENERIC_TIME_VSYSCALL is not set
36CONFIG_ARCH_HAS_CPU_RELAX=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
41CONFIG_ARCH_SUSPEND_POSSIBLE=y
42# CONFIG_ZONE_DMA32 is not set
43CONFIG_ARCH_POPULATES_NODE_MAP=y
44# CONFIG_AUDIT_ARCH is not set
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y
51CONFIG_X86_32_SMP=y
52CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y
55CONFIG_KTIME_SCALAR=y
25 56
26# 57#
27# Code maturity level options 58# General setup
28# 59#
29CONFIG_EXPERIMENTAL=y 60CONFIG_EXPERIMENTAL=y
30CONFIG_LOCK_KERNEL=y 61CONFIG_LOCK_KERNEL=y
31CONFIG_INIT_ENV_ARG_LIMIT=32 62CONFIG_INIT_ENV_ARG_LIMIT=32
32
33#
34# General setup
35#
36CONFIG_LOCALVERSION="" 63CONFIG_LOCALVERSION=""
37CONFIG_LOCALVERSION_AUTO=y 64# CONFIG_LOCALVERSION_AUTO is not set
38CONFIG_SWAP=y 65CONFIG_SWAP=y
39CONFIG_SYSVIPC=y 66CONFIG_SYSVIPC=y
40CONFIG_SYSVIPC_SYSCTL=y 67CONFIG_SYSVIPC_SYSCTL=y
41CONFIG_POSIX_MQUEUE=y 68CONFIG_POSIX_MQUEUE=y
42# CONFIG_BSD_PROCESS_ACCT is not set 69CONFIG_BSD_PROCESS_ACCT=y
43# CONFIG_TASKSTATS is not set 70# CONFIG_BSD_PROCESS_ACCT_V3 is not set
44# CONFIG_USER_NS is not set 71CONFIG_TASKSTATS=y
45# CONFIG_AUDIT is not set 72CONFIG_TASK_DELAY_ACCT=y
46CONFIG_IKCONFIG=y 73CONFIG_TASK_XACCT=y
47CONFIG_IKCONFIG_PROC=y 74CONFIG_TASK_IO_ACCOUNTING=y
48CONFIG_LOG_BUF_SHIFT=18 75CONFIG_AUDIT=y
49# CONFIG_CPUSETS is not set 76CONFIG_AUDITSYSCALL=y
50CONFIG_SYSFS_DEPRECATED=y 77CONFIG_AUDIT_TREE=y
78# CONFIG_IKCONFIG is not set
79CONFIG_LOG_BUF_SHIFT=17
80CONFIG_CGROUPS=y
81# CONFIG_CGROUP_DEBUG is not set
82CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y
85CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set
88# CONFIG_USER_SCHED is not set
89CONFIG_CGROUP_SCHED=y
90CONFIG_CGROUP_CPUACCT=y
91CONFIG_RESOURCE_COUNTERS=y
92# CONFIG_CGROUP_MEM_RES_CTLR is not set
93# CONFIG_SYSFS_DEPRECATED_V2 is not set
94CONFIG_PROC_PID_CPUSET=y
51CONFIG_RELAY=y 95CONFIG_RELAY=y
96CONFIG_NAMESPACES=y
97CONFIG_UTS_NS=y
98CONFIG_IPC_NS=y
99CONFIG_USER_NS=y
100CONFIG_PID_NS=y
52CONFIG_BLK_DEV_INITRD=y 101CONFIG_BLK_DEV_INITRD=y
53CONFIG_INITRAMFS_SOURCE="" 102CONFIG_INITRAMFS_SOURCE=""
54CONFIG_CC_OPTIMIZE_FOR_SIZE=y 103CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -56,13 +105,15 @@ CONFIG_SYSCTL=y
56# CONFIG_EMBEDDED is not set 105# CONFIG_EMBEDDED is not set
57CONFIG_UID16=y 106CONFIG_UID16=y
58CONFIG_SYSCTL_SYSCALL=y 107CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
59CONFIG_KALLSYMS=y 109CONFIG_KALLSYMS=y
60CONFIG_KALLSYMS_ALL=y 110CONFIG_KALLSYMS_ALL=y
61# CONFIG_KALLSYMS_EXTRA_PASS is not set 111CONFIG_KALLSYMS_EXTRA_PASS=y
62CONFIG_HOTPLUG=y 112CONFIG_HOTPLUG=y
63CONFIG_PRINTK=y 113CONFIG_PRINTK=y
64CONFIG_BUG=y 114CONFIG_BUG=y
65CONFIG_ELF_CORE=y 115CONFIG_ELF_CORE=y
116# CONFIG_COMPAT_BRK is not set
66CONFIG_BASE_FULL=y 117CONFIG_BASE_FULL=y
67CONFIG_FUTEX=y 118CONFIG_FUTEX=y
68CONFIG_ANON_INODES=y 119CONFIG_ANON_INODES=y
@@ -76,6 +127,17 @@ CONFIG_SLUB_DEBUG=y
76# CONFIG_SLAB is not set 127# CONFIG_SLAB is not set
77CONFIG_SLUB=y 128CONFIG_SLUB=y
78# CONFIG_SLOB is not set 129# CONFIG_SLOB is not set
130CONFIG_PROFILING=y
131CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y
135CONFIG_KRETPROBES=y
136CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y
138# CONFIG_HAVE_DMA_ATTRS is not set
139CONFIG_PROC_PAGE_MONITOR=y
140CONFIG_SLABINFO=y
79CONFIG_RT_MUTEXES=y 141CONFIG_RT_MUTEXES=y
80# CONFIG_TINY_SHMEM is not set 142# CONFIG_TINY_SHMEM is not set
81CONFIG_BASE_SMALL=0 143CONFIG_BASE_SMALL=0
@@ -87,10 +149,10 @@ CONFIG_MODULE_FORCE_UNLOAD=y
87# CONFIG_KMOD is not set 149# CONFIG_KMOD is not set
88CONFIG_STOP_MACHINE=y 150CONFIG_STOP_MACHINE=y
89CONFIG_BLOCK=y 151CONFIG_BLOCK=y
90CONFIG_LBD=y 152# CONFIG_LBD is not set
91# CONFIG_BLK_DEV_IO_TRACE is not set 153CONFIG_BLK_DEV_IO_TRACE=y
92# CONFIG_LSF is not set 154# CONFIG_LSF is not set
93# CONFIG_BLK_DEV_BSG is not set 155CONFIG_BLK_DEV_BSG=y
94 156
95# 157#
96# IO Schedulers 158# IO Schedulers
@@ -103,7 +165,8 @@ CONFIG_IOSCHED_CFQ=y
103# CONFIG_DEFAULT_DEADLINE is not set 165# CONFIG_DEFAULT_DEADLINE is not set
104CONFIG_DEFAULT_CFQ=y 166CONFIG_DEFAULT_CFQ=y
105# CONFIG_DEFAULT_NOOP is not set 167# CONFIG_DEFAULT_NOOP is not set
106CONFIG_DEFAULT_IOSCHED="anticipatory" 168CONFIG_DEFAULT_IOSCHED="cfq"
169CONFIG_CLASSIC_RCU=y
107 170
108# 171#
109# Processor type and features 172# Processor type and features
@@ -111,18 +174,21 @@ CONFIG_DEFAULT_IOSCHED="anticipatory"
111CONFIG_TICK_ONESHOT=y 174CONFIG_TICK_ONESHOT=y
112CONFIG_NO_HZ=y 175CONFIG_NO_HZ=y
113CONFIG_HIGH_RES_TIMERS=y 176CONFIG_HIGH_RES_TIMERS=y
177CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
114CONFIG_SMP=y 178CONFIG_SMP=y
115# CONFIG_X86_PC is not set 179CONFIG_X86_PC=y
116# CONFIG_X86_ELAN is not set 180# CONFIG_X86_ELAN is not set
117# CONFIG_X86_VOYAGER is not set 181# CONFIG_X86_VOYAGER is not set
118# CONFIG_X86_NUMAQ is not set 182# CONFIG_X86_NUMAQ is not set
119# CONFIG_X86_SUMMIT is not set 183# CONFIG_X86_SUMMIT is not set
120# CONFIG_X86_BIGSMP is not set 184# CONFIG_X86_BIGSMP is not set
121# CONFIG_X86_VISWS is not set 185# CONFIG_X86_VISWS is not set
122CONFIG_X86_GENERICARCH=y 186# CONFIG_X86_GENERICARCH is not set
123# CONFIG_X86_ES7000 is not set 187# CONFIG_X86_ES7000 is not set
124# CONFIG_PARAVIRT is not set 188# CONFIG_X86_RDC321X is not set
125CONFIG_X86_CYCLONE_TIMER=y 189# CONFIG_X86_VSMP is not set
190CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
191# CONFIG_PARAVIRT_GUEST is not set
126# CONFIG_M386 is not set 192# CONFIG_M386 is not set
127# CONFIG_M486 is not set 193# CONFIG_M486 is not set
128# CONFIG_M586 is not set 194# CONFIG_M586 is not set
@@ -130,9 +196,8 @@ CONFIG_X86_CYCLONE_TIMER=y
130# CONFIG_M586MMX is not set 196# CONFIG_M586MMX is not set
131# CONFIG_M686 is not set 197# CONFIG_M686 is not set
132# CONFIG_MPENTIUMII is not set 198# CONFIG_MPENTIUMII is not set
133CONFIG_MPENTIUMIII=y 199# CONFIG_MPENTIUMIII is not set
134# CONFIG_MPENTIUMM is not set 200# CONFIG_MPENTIUMM is not set
135# CONFIG_MCORE2 is not set
136# CONFIG_MPENTIUM4 is not set 201# CONFIG_MPENTIUM4 is not set
137# CONFIG_MK6 is not set 202# CONFIG_MK6 is not set
138# CONFIG_MK7 is not set 203# CONFIG_MK7 is not set
@@ -147,14 +212,14 @@ CONFIG_MPENTIUMIII=y
147# CONFIG_MCYRIXIII is not set 212# CONFIG_MCYRIXIII is not set
148# CONFIG_MVIAC3_2 is not set 213# CONFIG_MVIAC3_2 is not set
149# CONFIG_MVIAC7 is not set 214# CONFIG_MVIAC7 is not set
150CONFIG_X86_GENERIC=y 215# CONFIG_MPSC is not set
216CONFIG_MCORE2=y
217# CONFIG_GENERIC_CPU is not set
218# CONFIG_X86_GENERIC is not set
219CONFIG_X86_CPU=y
151CONFIG_X86_CMPXCHG=y 220CONFIG_X86_CMPXCHG=y
152CONFIG_X86_L1_CACHE_SHIFT=7 221CONFIG_X86_L1_CACHE_SHIFT=6
153CONFIG_X86_XADD=y 222CONFIG_X86_XADD=y
154CONFIG_RWSEM_XCHGADD_ALGORITHM=y
155# CONFIG_ARCH_HAS_ILOG2_U32 is not set
156# CONFIG_ARCH_HAS_ILOG2_U64 is not set
157CONFIG_GENERIC_CALIBRATE_DELAY=y
158CONFIG_X86_WP_WORKS_OK=y 223CONFIG_X86_WP_WORKS_OK=y
159CONFIG_X86_INVLPG=y 224CONFIG_X86_INVLPG=y
160CONFIG_X86_BSWAP=y 225CONFIG_X86_BSWAP=y
@@ -162,106 +227,120 @@ CONFIG_X86_POPAD_OK=y
162CONFIG_X86_GOOD_APIC=y 227CONFIG_X86_GOOD_APIC=y
163CONFIG_X86_INTEL_USERCOPY=y 228CONFIG_X86_INTEL_USERCOPY=y
164CONFIG_X86_USE_PPRO_CHECKSUM=y 229CONFIG_X86_USE_PPRO_CHECKSUM=y
230CONFIG_X86_P6_NOP=y
165CONFIG_X86_TSC=y 231CONFIG_X86_TSC=y
166CONFIG_X86_CMOV=y 232CONFIG_X86_MINIMUM_CPU_FAMILY=6
167CONFIG_X86_MINIMUM_CPU_FAMILY=4 233CONFIG_X86_DEBUGCTLMSR=y
168CONFIG_HPET_TIMER=y 234CONFIG_HPET_TIMER=y
169CONFIG_HPET_EMULATE_RTC=y 235CONFIG_HPET_EMULATE_RTC=y
170CONFIG_NR_CPUS=32 236CONFIG_DMI=y
171CONFIG_SCHED_SMT=y 237# CONFIG_IOMMU_HELPER is not set
238CONFIG_NR_CPUS=4
239# CONFIG_SCHED_SMT is not set
172CONFIG_SCHED_MC=y 240CONFIG_SCHED_MC=y
173# CONFIG_PREEMPT_NONE is not set 241# CONFIG_PREEMPT_NONE is not set
174CONFIG_PREEMPT_VOLUNTARY=y 242CONFIG_PREEMPT_VOLUNTARY=y
175# CONFIG_PREEMPT is not set 243# CONFIG_PREEMPT is not set
176CONFIG_PREEMPT_BKL=y
177CONFIG_X86_LOCAL_APIC=y 244CONFIG_X86_LOCAL_APIC=y
178CONFIG_X86_IO_APIC=y 245CONFIG_X86_IO_APIC=y
179CONFIG_X86_MCE=y 246# CONFIG_X86_MCE is not set
180CONFIG_X86_MCE_NONFATAL=y
181CONFIG_X86_MCE_P4THERMAL=y
182CONFIG_VM86=y 247CONFIG_VM86=y
183# CONFIG_TOSHIBA is not set 248# CONFIG_TOSHIBA is not set
184# CONFIG_I8K is not set 249# CONFIG_I8K is not set
185# CONFIG_X86_REBOOTFIXUPS is not set 250# CONFIG_X86_REBOOTFIXUPS is not set
186CONFIG_MICROCODE=y 251# CONFIG_MICROCODE is not set
187CONFIG_MICROCODE_OLD_INTERFACE=y
188CONFIG_X86_MSR=y 252CONFIG_X86_MSR=y
189CONFIG_X86_CPUID=y 253CONFIG_X86_CPUID=y
190
191#
192# Firmware Drivers
193#
194# CONFIG_EDD is not set
195# CONFIG_DELL_RBU is not set
196# CONFIG_DCDBAS is not set
197CONFIG_DMIID=y
198# CONFIG_NOHIGHMEM is not set 254# CONFIG_NOHIGHMEM is not set
199CONFIG_HIGHMEM4G=y 255CONFIG_HIGHMEM4G=y
200# CONFIG_HIGHMEM64G is not set 256# CONFIG_HIGHMEM64G is not set
201CONFIG_PAGE_OFFSET=0xC0000000 257CONFIG_PAGE_OFFSET=0xC0000000
202CONFIG_HIGHMEM=y 258CONFIG_HIGHMEM=y
203CONFIG_ARCH_POPULATES_NODE_MAP=y 259CONFIG_NEED_NODE_MEMMAP_SIZE=y
260CONFIG_ARCH_FLATMEM_ENABLE=y
261CONFIG_ARCH_SPARSEMEM_ENABLE=y
262CONFIG_ARCH_SELECT_MEMORY_MODEL=y
204CONFIG_SELECT_MEMORY_MODEL=y 263CONFIG_SELECT_MEMORY_MODEL=y
205CONFIG_FLATMEM_MANUAL=y 264# CONFIG_FLATMEM_MANUAL is not set
206# CONFIG_DISCONTIGMEM_MANUAL is not set 265# CONFIG_DISCONTIGMEM_MANUAL is not set
207# CONFIG_SPARSEMEM_MANUAL is not set 266CONFIG_SPARSEMEM_MANUAL=y
208CONFIG_FLATMEM=y 267CONFIG_SPARSEMEM=y
209CONFIG_FLAT_NODE_MEM_MAP=y 268CONFIG_HAVE_MEMORY_PRESENT=y
210# CONFIG_SPARSEMEM_STATIC is not set 269CONFIG_SPARSEMEM_STATIC=y
270# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
271
272#
273# Memory hotplug is currently incompatible with Software Suspend
274#
275CONFIG_PAGEFLAGS_EXTENDED=y
211CONFIG_SPLIT_PTLOCK_CPUS=4 276CONFIG_SPLIT_PTLOCK_CPUS=4
212CONFIG_RESOURCES_64BIT=y 277CONFIG_RESOURCES_64BIT=y
213CONFIG_ZONE_DMA_FLAG=1 278CONFIG_ZONE_DMA_FLAG=1
214CONFIG_BOUNCE=y 279CONFIG_BOUNCE=y
215CONFIG_NR_QUICK=1
216CONFIG_VIRT_TO_BUS=y 280CONFIG_VIRT_TO_BUS=y
217# CONFIG_HIGHPTE is not set 281# CONFIG_HIGHPTE is not set
218# CONFIG_MATH_EMULATION is not set 282# CONFIG_MATH_EMULATION is not set
219CONFIG_MTRR=y 283CONFIG_MTRR=y
220# CONFIG_EFI is not set 284# CONFIG_X86_PAT is not set
285CONFIG_EFI=y
221# CONFIG_IRQBALANCE is not set 286# CONFIG_IRQBALANCE is not set
222CONFIG_SECCOMP=y 287CONFIG_SECCOMP=y
223# CONFIG_HZ_100 is not set 288# CONFIG_HZ_100 is not set
224CONFIG_HZ_250=y 289# CONFIG_HZ_250 is not set
225# CONFIG_HZ_300 is not set 290# CONFIG_HZ_300 is not set
226# CONFIG_HZ_1000 is not set 291CONFIG_HZ_1000=y
227CONFIG_HZ=250 292CONFIG_HZ=1000
228# CONFIG_KEXEC is not set 293CONFIG_SCHED_HRTICK=y
229# CONFIG_CRASH_DUMP is not set 294CONFIG_KEXEC=y
230CONFIG_PHYSICAL_START=0x100000 295CONFIG_CRASH_DUMP=y
231# CONFIG_RELOCATABLE is not set 296CONFIG_PHYSICAL_START=0x1000000
232CONFIG_PHYSICAL_ALIGN=0x100000 297CONFIG_RELOCATABLE=y
233# CONFIG_HOTPLUG_CPU is not set 298CONFIG_PHYSICAL_ALIGN=0x200000
234CONFIG_COMPAT_VDSO=y 299CONFIG_HOTPLUG_CPU=y
300# CONFIG_COMPAT_VDSO is not set
235CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 301CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
236 302
237# 303#
238# Power management options (ACPI, APM) 304# Power management options
239# 305#
240CONFIG_PM=y 306CONFIG_PM=y
241CONFIG_PM_LEGACY=y 307CONFIG_PM_DEBUG=y
242# CONFIG_PM_DEBUG is not set 308# CONFIG_PM_VERBOSE is not set
243 309CONFIG_CAN_PM_TRACE=y
244# 310CONFIG_PM_TRACE=y
245# ACPI (Advanced Configuration and Power Interface) Support 311CONFIG_PM_TRACE_RTC=y
246# 312CONFIG_PM_SLEEP_SMP=y
313CONFIG_PM_SLEEP=y
314CONFIG_SUSPEND=y
315CONFIG_SUSPEND_FREEZER=y
316CONFIG_HIBERNATION=y
317CONFIG_PM_STD_PARTITION=""
247CONFIG_ACPI=y 318CONFIG_ACPI=y
319CONFIG_ACPI_SLEEP=y
248CONFIG_ACPI_PROCFS=y 320CONFIG_ACPI_PROCFS=y
321CONFIG_ACPI_PROCFS_POWER=y
322CONFIG_ACPI_SYSFS_POWER=y
323CONFIG_ACPI_PROC_EVENT=y
249CONFIG_ACPI_AC=y 324CONFIG_ACPI_AC=y
250CONFIG_ACPI_BATTERY=y 325CONFIG_ACPI_BATTERY=y
251CONFIG_ACPI_BUTTON=y 326CONFIG_ACPI_BUTTON=y
252CONFIG_ACPI_FAN=y 327CONFIG_ACPI_FAN=y
253# CONFIG_ACPI_DOCK is not set 328CONFIG_ACPI_DOCK=y
329# CONFIG_ACPI_BAY is not set
254CONFIG_ACPI_PROCESSOR=y 330CONFIG_ACPI_PROCESSOR=y
331CONFIG_ACPI_HOTPLUG_CPU=y
255CONFIG_ACPI_THERMAL=y 332CONFIG_ACPI_THERMAL=y
333# CONFIG_ACPI_WMI is not set
256# CONFIG_ACPI_ASUS is not set 334# CONFIG_ACPI_ASUS is not set
257# CONFIG_ACPI_TOSHIBA is not set 335# CONFIG_ACPI_TOSHIBA is not set
258CONFIG_ACPI_BLACKLIST_YEAR=2001 336# CONFIG_ACPI_CUSTOM_DSDT is not set
259CONFIG_ACPI_DEBUG=y 337CONFIG_ACPI_BLACKLIST_YEAR=0
338# CONFIG_ACPI_DEBUG is not set
260CONFIG_ACPI_EC=y 339CONFIG_ACPI_EC=y
261CONFIG_ACPI_POWER=y 340CONFIG_ACPI_POWER=y
262CONFIG_ACPI_SYSTEM=y 341CONFIG_ACPI_SYSTEM=y
263CONFIG_X86_PM_TIMER=y 342CONFIG_X86_PM_TIMER=y
264# CONFIG_ACPI_CONTAINER is not set 343CONFIG_ACPI_CONTAINER=y
265# CONFIG_ACPI_SBS is not set 344# CONFIG_ACPI_SBS is not set
266# CONFIG_APM is not set 345# CONFIG_APM is not set
267 346
@@ -271,15 +350,17 @@ CONFIG_X86_PM_TIMER=y
271CONFIG_CPU_FREQ=y 350CONFIG_CPU_FREQ=y
272CONFIG_CPU_FREQ_TABLE=y 351CONFIG_CPU_FREQ_TABLE=y
273CONFIG_CPU_FREQ_DEBUG=y 352CONFIG_CPU_FREQ_DEBUG=y
274CONFIG_CPU_FREQ_STAT=y 353# CONFIG_CPU_FREQ_STAT is not set
275# CONFIG_CPU_FREQ_STAT_DETAILS is not set 354# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
276CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y 355# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
277# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set 356CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
357# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
358# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
278CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 359CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
279# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 360# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
280CONFIG_CPU_FREQ_GOV_USERSPACE=y 361CONFIG_CPU_FREQ_GOV_USERSPACE=y
281CONFIG_CPU_FREQ_GOV_ONDEMAND=y 362CONFIG_CPU_FREQ_GOV_ONDEMAND=y
282CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y 363# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
283 364
284# 365#
285# CPUFreq processor drivers 366# CPUFreq processor drivers
@@ -287,8 +368,7 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
287CONFIG_X86_ACPI_CPUFREQ=y 368CONFIG_X86_ACPI_CPUFREQ=y
288# CONFIG_X86_POWERNOW_K6 is not set 369# CONFIG_X86_POWERNOW_K6 is not set
289# CONFIG_X86_POWERNOW_K7 is not set 370# CONFIG_X86_POWERNOW_K7 is not set
290CONFIG_X86_POWERNOW_K8=y 371# CONFIG_X86_POWERNOW_K8 is not set
291CONFIG_X86_POWERNOW_K8_ACPI=y
292# CONFIG_X86_GX_SUSPMOD is not set 372# CONFIG_X86_GX_SUSPMOD is not set
293# CONFIG_X86_SPEEDSTEP_CENTRINO is not set 373# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
294# CONFIG_X86_SPEEDSTEP_ICH is not set 374# CONFIG_X86_SPEEDSTEP_ICH is not set
@@ -302,43 +382,72 @@ CONFIG_X86_POWERNOW_K8_ACPI=y
302# 382#
303# shared options 383# shared options
304# 384#
305CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y 385# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
306# CONFIG_X86_SPEEDSTEP_LIB is not set 386# CONFIG_X86_SPEEDSTEP_LIB is not set
387CONFIG_CPU_IDLE=y
388CONFIG_CPU_IDLE_GOV_LADDER=y
389CONFIG_CPU_IDLE_GOV_MENU=y
307 390
308# 391#
309# Bus options (PCI, PCMCIA, EISA, MCA, ISA) 392# Bus options (PCI etc.)
310# 393#
311CONFIG_PCI=y 394CONFIG_PCI=y
312# CONFIG_PCI_GOBIOS is not set 395# CONFIG_PCI_GOBIOS is not set
313# CONFIG_PCI_GOMMCONFIG is not set 396# CONFIG_PCI_GOMMCONFIG is not set
314# CONFIG_PCI_GODIRECT is not set 397# CONFIG_PCI_GODIRECT is not set
315CONFIG_PCI_GOANY=y 398CONFIG_PCI_GOANY=y
399# CONFIG_PCI_GOOLPC is not set
316CONFIG_PCI_BIOS=y 400CONFIG_PCI_BIOS=y
317CONFIG_PCI_DIRECT=y 401CONFIG_PCI_DIRECT=y
318CONFIG_PCI_MMCONFIG=y 402CONFIG_PCI_MMCONFIG=y
319# CONFIG_PCIEPORTBUS is not set 403CONFIG_PCI_DOMAINS=y
404CONFIG_PCIEPORTBUS=y
405# CONFIG_HOTPLUG_PCI_PCIE is not set
406CONFIG_PCIEAER=y
407# CONFIG_PCIEASPM is not set
320CONFIG_ARCH_SUPPORTS_MSI=y 408CONFIG_ARCH_SUPPORTS_MSI=y
321CONFIG_PCI_MSI=y 409CONFIG_PCI_MSI=y
410# CONFIG_PCI_LEGACY is not set
322# CONFIG_PCI_DEBUG is not set 411# CONFIG_PCI_DEBUG is not set
323# CONFIG_HT_IRQ is not set 412CONFIG_HT_IRQ=y
324CONFIG_ISA_DMA_API=y 413CONFIG_ISA_DMA_API=y
325# CONFIG_ISA is not set 414# CONFIG_ISA is not set
326# CONFIG_MCA is not set 415# CONFIG_MCA is not set
327# CONFIG_SCx200 is not set 416# CONFIG_SCx200 is not set
417# CONFIG_OLPC is not set
328CONFIG_K8_NB=y 418CONFIG_K8_NB=y
329 419CONFIG_PCCARD=y
330# 420# CONFIG_PCMCIA_DEBUG is not set
331# PCCARD (PCMCIA/CardBus) support 421CONFIG_PCMCIA=y
332# 422CONFIG_PCMCIA_LOAD_CIS=y
333# CONFIG_PCCARD is not set 423CONFIG_PCMCIA_IOCTL=y
334# CONFIG_HOTPLUG_PCI is not set 424CONFIG_CARDBUS=y
335 425
336# 426#
337# Executable file formats 427# PC-card bridges
428#
429CONFIG_YENTA=y
430CONFIG_YENTA_O2=y
431CONFIG_YENTA_RICOH=y
432CONFIG_YENTA_TI=y
433CONFIG_YENTA_ENE_TUNE=y
434CONFIG_YENTA_TOSHIBA=y
435# CONFIG_PD6729 is not set
436# CONFIG_I82092 is not set
437CONFIG_PCCARD_NONSTATIC=y
438CONFIG_HOTPLUG_PCI=y
439# CONFIG_HOTPLUG_PCI_FAKE is not set
440# CONFIG_HOTPLUG_PCI_IBM is not set
441# CONFIG_HOTPLUG_PCI_ACPI is not set
442# CONFIG_HOTPLUG_PCI_CPCI is not set
443# CONFIG_HOTPLUG_PCI_SHPC is not set
444
445#
446# Executable file formats / Emulations
338# 447#
339CONFIG_BINFMT_ELF=y 448CONFIG_BINFMT_ELF=y
340# CONFIG_BINFMT_AOUT is not set 449# CONFIG_BINFMT_AOUT is not set
341# CONFIG_BINFMT_MISC is not set 450CONFIG_BINFMT_MISC=y
342 451
343# 452#
344# Networking 453# Networking
@@ -349,59 +458,142 @@ CONFIG_NET=y
349# Networking options 458# Networking options
350# 459#
351CONFIG_PACKET=y 460CONFIG_PACKET=y
352# CONFIG_PACKET_MMAP is not set 461CONFIG_PACKET_MMAP=y
353CONFIG_UNIX=y 462CONFIG_UNIX=y
354CONFIG_XFRM=y 463CONFIG_XFRM=y
355# CONFIG_XFRM_USER is not set 464CONFIG_XFRM_USER=y
356# CONFIG_XFRM_SUB_POLICY is not set 465# CONFIG_XFRM_SUB_POLICY is not set
357# CONFIG_XFRM_MIGRATE is not set 466# CONFIG_XFRM_MIGRATE is not set
467# CONFIG_XFRM_STATISTICS is not set
358# CONFIG_NET_KEY is not set 468# CONFIG_NET_KEY is not set
359CONFIG_INET=y 469CONFIG_INET=y
360CONFIG_IP_MULTICAST=y 470CONFIG_IP_MULTICAST=y
361# CONFIG_IP_ADVANCED_ROUTER is not set 471CONFIG_IP_ADVANCED_ROUTER=y
472CONFIG_ASK_IP_FIB_HASH=y
473# CONFIG_IP_FIB_TRIE is not set
362CONFIG_IP_FIB_HASH=y 474CONFIG_IP_FIB_HASH=y
363CONFIG_IP_PNP=y 475CONFIG_IP_MULTIPLE_TABLES=y
364CONFIG_IP_PNP_DHCP=y 476CONFIG_IP_ROUTE_MULTIPATH=y
365# CONFIG_IP_PNP_BOOTP is not set 477CONFIG_IP_ROUTE_VERBOSE=y
366# CONFIG_IP_PNP_RARP is not set 478# CONFIG_IP_PNP is not set
367# CONFIG_NET_IPIP is not set 479# CONFIG_NET_IPIP is not set
368# CONFIG_NET_IPGRE is not set 480# CONFIG_NET_IPGRE is not set
369# CONFIG_IP_MROUTE is not set 481CONFIG_IP_MROUTE=y
482CONFIG_IP_PIMSM_V1=y
483CONFIG_IP_PIMSM_V2=y
370# CONFIG_ARPD is not set 484# CONFIG_ARPD is not set
371# CONFIG_SYN_COOKIES is not set 485CONFIG_SYN_COOKIES=y
372# CONFIG_INET_AH is not set 486# CONFIG_INET_AH is not set
373# CONFIG_INET_ESP is not set 487# CONFIG_INET_ESP is not set
374# CONFIG_INET_IPCOMP is not set 488# CONFIG_INET_IPCOMP is not set
375# CONFIG_INET_XFRM_TUNNEL is not set 489# CONFIG_INET_XFRM_TUNNEL is not set
376CONFIG_INET_TUNNEL=y 490CONFIG_INET_TUNNEL=y
377CONFIG_INET_XFRM_MODE_TRANSPORT=y 491# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
378CONFIG_INET_XFRM_MODE_TUNNEL=y 492# CONFIG_INET_XFRM_MODE_TUNNEL is not set
379# CONFIG_INET_XFRM_MODE_BEET is not set 493# CONFIG_INET_XFRM_MODE_BEET is not set
380CONFIG_INET_DIAG=y 494CONFIG_INET_LRO=y
381CONFIG_INET_TCP_DIAG=y 495# CONFIG_INET_DIAG is not set
382# CONFIG_TCP_CONG_ADVANCED is not set 496CONFIG_TCP_CONG_ADVANCED=y
497# CONFIG_TCP_CONG_BIC is not set
383CONFIG_TCP_CONG_CUBIC=y 498CONFIG_TCP_CONG_CUBIC=y
499# CONFIG_TCP_CONG_WESTWOOD is not set
500# CONFIG_TCP_CONG_HTCP is not set
501# CONFIG_TCP_CONG_HSTCP is not set
502# CONFIG_TCP_CONG_HYBLA is not set
503# CONFIG_TCP_CONG_VEGAS is not set
504# CONFIG_TCP_CONG_SCALABLE is not set
505# CONFIG_TCP_CONG_LP is not set
506# CONFIG_TCP_CONG_VENO is not set
507# CONFIG_TCP_CONG_YEAH is not set
508# CONFIG_TCP_CONG_ILLINOIS is not set
509# CONFIG_DEFAULT_BIC is not set
510CONFIG_DEFAULT_CUBIC=y
511# CONFIG_DEFAULT_HTCP is not set
512# CONFIG_DEFAULT_VEGAS is not set
513# CONFIG_DEFAULT_WESTWOOD is not set
514# CONFIG_DEFAULT_RENO is not set
384CONFIG_DEFAULT_TCP_CONG="cubic" 515CONFIG_DEFAULT_TCP_CONG="cubic"
385# CONFIG_TCP_MD5SIG is not set 516CONFIG_TCP_MD5SIG=y
517# CONFIG_IP_VS is not set
386CONFIG_IPV6=y 518CONFIG_IPV6=y
387# CONFIG_IPV6_PRIVACY is not set 519# CONFIG_IPV6_PRIVACY is not set
388# CONFIG_IPV6_ROUTER_PREF is not set 520# CONFIG_IPV6_ROUTER_PREF is not set
389# CONFIG_IPV6_OPTIMISTIC_DAD is not set 521# CONFIG_IPV6_OPTIMISTIC_DAD is not set
390# CONFIG_INET6_AH is not set 522CONFIG_INET6_AH=y
391# CONFIG_INET6_ESP is not set 523CONFIG_INET6_ESP=y
392# CONFIG_INET6_IPCOMP is not set 524# CONFIG_INET6_IPCOMP is not set
393# CONFIG_IPV6_MIP6 is not set 525# CONFIG_IPV6_MIP6 is not set
394# CONFIG_INET6_XFRM_TUNNEL is not set 526# CONFIG_INET6_XFRM_TUNNEL is not set
395# CONFIG_INET6_TUNNEL is not set 527# CONFIG_INET6_TUNNEL is not set
396CONFIG_INET6_XFRM_MODE_TRANSPORT=y 528CONFIG_INET6_XFRM_MODE_TRANSPORT=y
397CONFIG_INET6_XFRM_MODE_TUNNEL=y 529CONFIG_INET6_XFRM_MODE_TUNNEL=y
398# CONFIG_INET6_XFRM_MODE_BEET is not set 530CONFIG_INET6_XFRM_MODE_BEET=y
399# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set 531# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
400CONFIG_IPV6_SIT=y 532CONFIG_IPV6_SIT=y
533CONFIG_IPV6_NDISC_NODETYPE=y
401# CONFIG_IPV6_TUNNEL is not set 534# CONFIG_IPV6_TUNNEL is not set
402# CONFIG_IPV6_MULTIPLE_TABLES is not set 535# CONFIG_IPV6_MULTIPLE_TABLES is not set
403# CONFIG_NETWORK_SECMARK is not set 536# CONFIG_IPV6_MROUTE is not set
404# CONFIG_NETFILTER is not set 537CONFIG_NETLABEL=y
538CONFIG_NETWORK_SECMARK=y
539CONFIG_NETFILTER=y
540# CONFIG_NETFILTER_DEBUG is not set
541# CONFIG_NETFILTER_ADVANCED is not set
542
543#
544# Core Netfilter Configuration
545#
546CONFIG_NETFILTER_NETLINK=y
547CONFIG_NETFILTER_NETLINK_LOG=y
548CONFIG_NF_CONNTRACK=y
549CONFIG_NF_CONNTRACK_SECMARK=y
550CONFIG_NF_CONNTRACK_FTP=y
551CONFIG_NF_CONNTRACK_IRC=y
552CONFIG_NF_CONNTRACK_SIP=y
553CONFIG_NF_CT_NETLINK=y
554CONFIG_NETFILTER_XTABLES=y
555CONFIG_NETFILTER_XT_TARGET_MARK=y
556CONFIG_NETFILTER_XT_TARGET_NFLOG=y
557CONFIG_NETFILTER_XT_TARGET_SECMARK=y
558CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
559CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
560CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
561CONFIG_NETFILTER_XT_MATCH_MARK=y
562CONFIG_NETFILTER_XT_MATCH_POLICY=y
563CONFIG_NETFILTER_XT_MATCH_STATE=y
564
565#
566# IP: Netfilter Configuration
567#
568CONFIG_NF_CONNTRACK_IPV4=y
569CONFIG_NF_CONNTRACK_PROC_COMPAT=y
570CONFIG_IP_NF_IPTABLES=y
571CONFIG_IP_NF_FILTER=y
572CONFIG_IP_NF_TARGET_REJECT=y
573CONFIG_IP_NF_TARGET_LOG=y
574CONFIG_IP_NF_TARGET_ULOG=y
575CONFIG_NF_NAT=y
576CONFIG_NF_NAT_NEEDED=y
577CONFIG_IP_NF_TARGET_MASQUERADE=y
578CONFIG_NF_NAT_FTP=y
579CONFIG_NF_NAT_IRC=y
580# CONFIG_NF_NAT_TFTP is not set
581# CONFIG_NF_NAT_AMANDA is not set
582# CONFIG_NF_NAT_PPTP is not set
583# CONFIG_NF_NAT_H323 is not set
584CONFIG_NF_NAT_SIP=y
585CONFIG_IP_NF_MANGLE=y
586
587#
588# IPv6: Netfilter Configuration
589#
590CONFIG_NF_CONNTRACK_IPV6=y
591CONFIG_IP6_NF_IPTABLES=y
592CONFIG_IP6_NF_MATCH_IPV6HEADER=y
593CONFIG_IP6_NF_FILTER=y
594CONFIG_IP6_NF_TARGET_LOG=y
595CONFIG_IP6_NF_TARGET_REJECT=y
596CONFIG_IP6_NF_MANGLE=y
405# CONFIG_IP_DCCP is not set 597# CONFIG_IP_DCCP is not set
406# CONFIG_IP_SCTP is not set 598# CONFIG_IP_SCTP is not set
407# CONFIG_TIPC is not set 599# CONFIG_TIPC is not set
@@ -409,6 +601,7 @@ CONFIG_IPV6_SIT=y
409# CONFIG_BRIDGE is not set 601# CONFIG_BRIDGE is not set
410# CONFIG_VLAN_8021Q is not set 602# CONFIG_VLAN_8021Q is not set
411# CONFIG_DECNET is not set 603# CONFIG_DECNET is not set
604CONFIG_LLC=y
412# CONFIG_LLC2 is not set 605# CONFIG_LLC2 is not set
413# CONFIG_IPX is not set 606# CONFIG_IPX is not set
414# CONFIG_ATALK is not set 607# CONFIG_ATALK is not set
@@ -416,28 +609,99 @@ CONFIG_IPV6_SIT=y
416# CONFIG_LAPB is not set 609# CONFIG_LAPB is not set
417# CONFIG_ECONET is not set 610# CONFIG_ECONET is not set
418# CONFIG_WAN_ROUTER is not set 611# CONFIG_WAN_ROUTER is not set
419 612CONFIG_NET_SCHED=y
420# 613
421# QoS and/or fair queueing 614#
422# 615# Queueing/Scheduling
423# CONFIG_NET_SCHED is not set 616#
617# CONFIG_NET_SCH_CBQ is not set
618# CONFIG_NET_SCH_HTB is not set
619# CONFIG_NET_SCH_HFSC is not set
620# CONFIG_NET_SCH_PRIO is not set
621# CONFIG_NET_SCH_RR is not set
622# CONFIG_NET_SCH_RED is not set
623# CONFIG_NET_SCH_SFQ is not set
624# CONFIG_NET_SCH_TEQL is not set
625# CONFIG_NET_SCH_TBF is not set
626# CONFIG_NET_SCH_GRED is not set
627# CONFIG_NET_SCH_DSMARK is not set
628# CONFIG_NET_SCH_NETEM is not set
629# CONFIG_NET_SCH_INGRESS is not set
630
631#
632# Classification
633#
634CONFIG_NET_CLS=y
635# CONFIG_NET_CLS_BASIC is not set
636# CONFIG_NET_CLS_TCINDEX is not set
637# CONFIG_NET_CLS_ROUTE4 is not set
638# CONFIG_NET_CLS_FW is not set
639# CONFIG_NET_CLS_U32 is not set
640# CONFIG_NET_CLS_RSVP is not set
641# CONFIG_NET_CLS_RSVP6 is not set
642# CONFIG_NET_CLS_FLOW is not set
643CONFIG_NET_EMATCH=y
644CONFIG_NET_EMATCH_STACK=32
645# CONFIG_NET_EMATCH_CMP is not set
646# CONFIG_NET_EMATCH_NBYTE is not set
647# CONFIG_NET_EMATCH_U32 is not set
648# CONFIG_NET_EMATCH_META is not set
649# CONFIG_NET_EMATCH_TEXT is not set
650CONFIG_NET_CLS_ACT=y
651# CONFIG_NET_ACT_POLICE is not set
652# CONFIG_NET_ACT_GACT is not set
653# CONFIG_NET_ACT_MIRRED is not set
654# CONFIG_NET_ACT_IPT is not set
655# CONFIG_NET_ACT_NAT is not set
656# CONFIG_NET_ACT_PEDIT is not set
657# CONFIG_NET_ACT_SIMP is not set
658CONFIG_NET_SCH_FIFO=y
424 659
425# 660#
426# Network testing 661# Network testing
427# 662#
428# CONFIG_NET_PKTGEN is not set 663# CONFIG_NET_PKTGEN is not set
429# CONFIG_NET_TCPPROBE is not set 664# CONFIG_NET_TCPPROBE is not set
430# CONFIG_HAMRADIO is not set 665CONFIG_HAMRADIO=y
666
667#
668# Packet Radio protocols
669#
670# CONFIG_AX25 is not set
671# CONFIG_CAN is not set
431# CONFIG_IRDA is not set 672# CONFIG_IRDA is not set
432# CONFIG_BT is not set 673# CONFIG_BT is not set
433# CONFIG_AF_RXRPC is not set 674# CONFIG_AF_RXRPC is not set
675CONFIG_FIB_RULES=y
434 676
435# 677#
436# Wireless 678# Wireless
437# 679#
438# CONFIG_CFG80211 is not set 680CONFIG_CFG80211=y
439# CONFIG_WIRELESS_EXT is not set 681CONFIG_NL80211=y
440# CONFIG_MAC80211 is not set 682CONFIG_WIRELESS_EXT=y
683CONFIG_MAC80211=y
684
685#
686# Rate control algorithm selection
687#
688CONFIG_MAC80211_RC_DEFAULT_PID=y
689# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
690
691#
692# Selecting 'y' for an algorithm will
693#
694
695#
696# build the algorithm into mac80211.
697#
698CONFIG_MAC80211_RC_DEFAULT="pid"
699CONFIG_MAC80211_RC_PID=y
700# CONFIG_MAC80211_MESH is not set
701CONFIG_MAC80211_LEDS=y
702# CONFIG_MAC80211_DEBUGFS is not set
703# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set
704# CONFIG_MAC80211_DEBUG is not set
441# CONFIG_IEEE80211 is not set 705# CONFIG_IEEE80211 is not set
442# CONFIG_RFKILL is not set 706# CONFIG_RFKILL is not set
443# CONFIG_NET_9P is not set 707# CONFIG_NET_9P is not set
@@ -449,13 +713,15 @@ CONFIG_IPV6_SIT=y
449# 713#
450# Generic Driver Options 714# Generic Driver Options
451# 715#
716CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
452CONFIG_STANDALONE=y 717CONFIG_STANDALONE=y
453CONFIG_PREVENT_FIRMWARE_BUILD=y 718CONFIG_PREVENT_FIRMWARE_BUILD=y
454CONFIG_FW_LOADER=y 719CONFIG_FW_LOADER=y
455# CONFIG_DEBUG_DRIVER is not set 720# CONFIG_DEBUG_DRIVER is not set
456# CONFIG_DEBUG_DEVRES is not set 721CONFIG_DEBUG_DEVRES=y
457# CONFIG_SYS_HYPERVISOR is not set 722# CONFIG_SYS_HYPERVISOR is not set
458# CONFIG_CONNECTOR is not set 723CONFIG_CONNECTOR=y
724CONFIG_PROC_EVENTS=y
459# CONFIG_MTD is not set 725# CONFIG_MTD is not set
460# CONFIG_PARPORT is not set 726# CONFIG_PARPORT is not set
461CONFIG_PNP=y 727CONFIG_PNP=y
@@ -466,7 +732,7 @@ CONFIG_PNP=y
466# 732#
467CONFIG_PNPACPI=y 733CONFIG_PNPACPI=y
468CONFIG_BLK_DEV=y 734CONFIG_BLK_DEV=y
469CONFIG_BLK_DEV_FD=y 735# CONFIG_BLK_DEV_FD is not set
470# CONFIG_BLK_CPQ_DA is not set 736# CONFIG_BLK_CPQ_DA is not set
471# CONFIG_BLK_CPQ_CISS_DA is not set 737# CONFIG_BLK_CPQ_CISS_DA is not set
472# CONFIG_BLK_DEV_DAC960 is not set 738# CONFIG_BLK_DEV_DAC960 is not set
@@ -479,8 +745,8 @@ CONFIG_BLK_DEV_LOOP=y
479# CONFIG_BLK_DEV_UB is not set 745# CONFIG_BLK_DEV_UB is not set
480CONFIG_BLK_DEV_RAM=y 746CONFIG_BLK_DEV_RAM=y
481CONFIG_BLK_DEV_RAM_COUNT=16 747CONFIG_BLK_DEV_RAM_COUNT=16
482CONFIG_BLK_DEV_RAM_SIZE=4096 748CONFIG_BLK_DEV_RAM_SIZE=16384
483CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 749# CONFIG_BLK_DEV_XIP is not set
484# CONFIG_CDROM_PKTCDVD is not set 750# CONFIG_CDROM_PKTCDVD is not set
485# CONFIG_ATA_OVER_ETH is not set 751# CONFIG_ATA_OVER_ETH is not set
486CONFIG_MISC_DEVICES=y 752CONFIG_MISC_DEVICES=y
@@ -489,73 +755,17 @@ CONFIG_MISC_DEVICES=y
489# CONFIG_EEPROM_93CX6 is not set 755# CONFIG_EEPROM_93CX6 is not set
490# CONFIG_SGI_IOC4 is not set 756# CONFIG_SGI_IOC4 is not set
491# CONFIG_TIFM_CORE is not set 757# CONFIG_TIFM_CORE is not set
758# CONFIG_ACER_WMI is not set
759# CONFIG_ASUS_LAPTOP is not set
760# CONFIG_FUJITSU_LAPTOP is not set
761# CONFIG_TC1100_WMI is not set
762# CONFIG_MSI_LAPTOP is not set
492# CONFIG_SONY_LAPTOP is not set 763# CONFIG_SONY_LAPTOP is not set
493# CONFIG_THINKPAD_ACPI is not set 764# CONFIG_THINKPAD_ACPI is not set
494CONFIG_IDE=y 765# CONFIG_INTEL_MENLOW is not set
495CONFIG_BLK_DEV_IDE=y 766# CONFIG_ENCLOSURE_SERVICES is not set
496 767CONFIG_HAVE_IDE=y
497# 768# CONFIG_IDE is not set
498# Please see Documentation/ide.txt for help/info on IDE drives
499#
500# CONFIG_BLK_DEV_IDE_SATA is not set
501# CONFIG_BLK_DEV_HD_IDE is not set
502CONFIG_BLK_DEV_IDEDISK=y
503CONFIG_IDEDISK_MULTI_MODE=y
504CONFIG_BLK_DEV_IDECD=y
505# CONFIG_BLK_DEV_IDETAPE is not set
506# CONFIG_BLK_DEV_IDEFLOPPY is not set
507# CONFIG_BLK_DEV_IDESCSI is not set
508CONFIG_BLK_DEV_IDEACPI=y
509# CONFIG_IDE_TASK_IOCTL is not set
510CONFIG_IDE_PROC_FS=y
511
512#
513# IDE chipset support/bugfixes
514#
515CONFIG_IDE_GENERIC=y
516# CONFIG_BLK_DEV_CMD640 is not set
517# CONFIG_BLK_DEV_IDEPNP is not set
518CONFIG_BLK_DEV_IDEPCI=y
519# CONFIG_IDEPCI_SHARE_IRQ is not set
520CONFIG_IDEPCI_PCIBUS_ORDER=y
521# CONFIG_BLK_DEV_OFFBOARD is not set
522# CONFIG_BLK_DEV_GENERIC is not set
523# CONFIG_BLK_DEV_OPTI621 is not set
524# CONFIG_BLK_DEV_RZ1000 is not set
525CONFIG_BLK_DEV_IDEDMA_PCI=y
526# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
527# CONFIG_IDEDMA_ONLYDISK is not set
528# CONFIG_BLK_DEV_AEC62XX is not set
529# CONFIG_BLK_DEV_ALI15X3 is not set
530CONFIG_BLK_DEV_AMD74XX=y
531# CONFIG_BLK_DEV_ATIIXP is not set
532# CONFIG_BLK_DEV_CMD64X is not set
533# CONFIG_BLK_DEV_TRIFLEX is not set
534# CONFIG_BLK_DEV_CY82C693 is not set
535# CONFIG_BLK_DEV_CS5520 is not set
536# CONFIG_BLK_DEV_CS5530 is not set
537# CONFIG_BLK_DEV_CS5535 is not set
538# CONFIG_BLK_DEV_HPT34X is not set
539# CONFIG_BLK_DEV_HPT366 is not set
540# CONFIG_BLK_DEV_JMICRON is not set
541# CONFIG_BLK_DEV_SC1200 is not set
542CONFIG_BLK_DEV_PIIX=y
543# CONFIG_BLK_DEV_IT8213 is not set
544# CONFIG_BLK_DEV_IT821X is not set
545# CONFIG_BLK_DEV_NS87415 is not set
546# CONFIG_BLK_DEV_PDC202XX_OLD is not set
547# CONFIG_BLK_DEV_PDC202XX_NEW is not set
548# CONFIG_BLK_DEV_SVWKS is not set
549# CONFIG_BLK_DEV_SIIMAGE is not set
550# CONFIG_BLK_DEV_SIS5513 is not set
551# CONFIG_BLK_DEV_SLC90E66 is not set
552# CONFIG_BLK_DEV_TRM290 is not set
553# CONFIG_BLK_DEV_VIA82CXXX is not set
554# CONFIG_BLK_DEV_TC86C001 is not set
555# CONFIG_IDE_ARM is not set
556CONFIG_BLK_DEV_IDEDMA=y
557# CONFIG_IDEDMA_IVB is not set
558# CONFIG_BLK_DEV_HD is not set
559 769
560# 770#
561# SCSI device support 771# SCSI device support
@@ -564,8 +774,8 @@ CONFIG_BLK_DEV_IDEDMA=y
564CONFIG_SCSI=y 774CONFIG_SCSI=y
565CONFIG_SCSI_DMA=y 775CONFIG_SCSI_DMA=y
566# CONFIG_SCSI_TGT is not set 776# CONFIG_SCSI_TGT is not set
567CONFIG_SCSI_NETLINK=y 777# CONFIG_SCSI_NETLINK is not set
568# CONFIG_SCSI_PROC_FS is not set 778CONFIG_SCSI_PROC_FS=y
569 779
570# 780#
571# SCSI support type (disk, tape, CD-ROM) 781# SCSI support type (disk, tape, CD-ROM)
@@ -574,7 +784,7 @@ CONFIG_BLK_DEV_SD=y
574# CONFIG_CHR_DEV_ST is not set 784# CONFIG_CHR_DEV_ST is not set
575# CONFIG_CHR_DEV_OSST is not set 785# CONFIG_CHR_DEV_OSST is not set
576CONFIG_BLK_DEV_SR=y 786CONFIG_BLK_DEV_SR=y
577# CONFIG_BLK_DEV_SR_VENDOR is not set 787CONFIG_BLK_DEV_SR_VENDOR=y
578CONFIG_CHR_DEV_SG=y 788CONFIG_CHR_DEV_SG=y
579# CONFIG_CHR_DEV_SCH is not set 789# CONFIG_CHR_DEV_SCH is not set
580 790
@@ -582,7 +792,7 @@ CONFIG_CHR_DEV_SG=y
582# Some SCSI devices (e.g. CD jukebox) support multiple LUNs 792# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
583# 793#
584# CONFIG_SCSI_MULTI_LUN is not set 794# CONFIG_SCSI_MULTI_LUN is not set
585# CONFIG_SCSI_CONSTANTS is not set 795CONFIG_SCSI_CONSTANTS=y
586# CONFIG_SCSI_LOGGING is not set 796# CONFIG_SCSI_LOGGING is not set
587# CONFIG_SCSI_SCAN_ASYNC is not set 797# CONFIG_SCSI_SCAN_ASYNC is not set
588CONFIG_SCSI_WAIT_SCAN=m 798CONFIG_SCSI_WAIT_SCAN=m
@@ -591,81 +801,37 @@ CONFIG_SCSI_WAIT_SCAN=m
591# SCSI Transports 801# SCSI Transports
592# 802#
593CONFIG_SCSI_SPI_ATTRS=y 803CONFIG_SCSI_SPI_ATTRS=y
594CONFIG_SCSI_FC_ATTRS=y 804# CONFIG_SCSI_FC_ATTRS is not set
595# CONFIG_SCSI_ISCSI_ATTRS is not set 805# CONFIG_SCSI_ISCSI_ATTRS is not set
596# CONFIG_SCSI_SAS_ATTRS is not set 806# CONFIG_SCSI_SAS_ATTRS is not set
597# CONFIG_SCSI_SAS_LIBSAS is not set 807# CONFIG_SCSI_SAS_LIBSAS is not set
598 808# CONFIG_SCSI_SRP_ATTRS is not set
599# 809# CONFIG_SCSI_LOWLEVEL is not set
600# SCSI low-level drivers 810# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
601#
602# CONFIG_ISCSI_TCP is not set
603CONFIG_BLK_DEV_3W_XXXX_RAID=y
604# CONFIG_SCSI_3W_9XXX is not set
605# CONFIG_SCSI_ACARD is not set
606# CONFIG_SCSI_AACRAID is not set
607CONFIG_SCSI_AIC7XXX=y
608CONFIG_AIC7XXX_CMDS_PER_DEVICE=32
609CONFIG_AIC7XXX_RESET_DELAY_MS=5000
610CONFIG_AIC7XXX_DEBUG_ENABLE=y
611CONFIG_AIC7XXX_DEBUG_MASK=0
612CONFIG_AIC7XXX_REG_PRETTY_PRINT=y
613# CONFIG_SCSI_AIC7XXX_OLD is not set
614CONFIG_SCSI_AIC79XX=y
615CONFIG_AIC79XX_CMDS_PER_DEVICE=32
616CONFIG_AIC79XX_RESET_DELAY_MS=4000
617# CONFIG_AIC79XX_DEBUG_ENABLE is not set
618CONFIG_AIC79XX_DEBUG_MASK=0
619# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
620# CONFIG_SCSI_AIC94XX is not set
621# CONFIG_SCSI_DPT_I2O is not set
622# CONFIG_SCSI_ADVANSYS is not set
623# CONFIG_SCSI_ARCMSR is not set
624# CONFIG_MEGARAID_NEWGEN is not set
625# CONFIG_MEGARAID_LEGACY is not set
626# CONFIG_MEGARAID_SAS is not set
627# CONFIG_SCSI_HPTIOP is not set
628# CONFIG_SCSI_BUSLOGIC is not set
629# CONFIG_SCSI_DMX3191D is not set
630# CONFIG_SCSI_EATA is not set
631# CONFIG_SCSI_FUTURE_DOMAIN is not set
632# CONFIG_SCSI_GDTH is not set
633# CONFIG_SCSI_IPS is not set
634# CONFIG_SCSI_INITIO is not set
635# CONFIG_SCSI_INIA100 is not set
636# CONFIG_SCSI_STEX is not set
637# CONFIG_SCSI_SYM53C8XX_2 is not set
638# CONFIG_SCSI_IPR is not set
639# CONFIG_SCSI_QLOGIC_1280 is not set
640# CONFIG_SCSI_QLA_FC is not set
641# CONFIG_SCSI_QLA_ISCSI is not set
642# CONFIG_SCSI_LPFC is not set
643# CONFIG_SCSI_DC395x is not set
644# CONFIG_SCSI_DC390T is not set
645# CONFIG_SCSI_NSP32 is not set
646# CONFIG_SCSI_DEBUG is not set
647# CONFIG_SCSI_SRP is not set
648CONFIG_ATA=y 811CONFIG_ATA=y
649# CONFIG_ATA_NONSTANDARD is not set 812# CONFIG_ATA_NONSTANDARD is not set
650CONFIG_ATA_ACPI=y 813CONFIG_ATA_ACPI=y
814CONFIG_SATA_PMP=y
651CONFIG_SATA_AHCI=y 815CONFIG_SATA_AHCI=y
652CONFIG_SATA_SVW=y 816# CONFIG_SATA_SIL24 is not set
817CONFIG_ATA_SFF=y
818# CONFIG_SATA_SVW is not set
653CONFIG_ATA_PIIX=y 819CONFIG_ATA_PIIX=y
654# CONFIG_SATA_MV is not set 820# CONFIG_SATA_MV is not set
655CONFIG_SATA_NV=y 821# CONFIG_SATA_NV is not set
656# CONFIG_PDC_ADMA is not set 822# CONFIG_PDC_ADMA is not set
657# CONFIG_SATA_QSTOR is not set 823# CONFIG_SATA_QSTOR is not set
658# CONFIG_SATA_PROMISE is not set 824# CONFIG_SATA_PROMISE is not set
659# CONFIG_SATA_SX4 is not set 825# CONFIG_SATA_SX4 is not set
660CONFIG_SATA_SIL=y 826# CONFIG_SATA_SIL is not set
661# CONFIG_SATA_SIL24 is not set
662# CONFIG_SATA_SIS is not set 827# CONFIG_SATA_SIS is not set
663# CONFIG_SATA_ULI is not set 828# CONFIG_SATA_ULI is not set
664CONFIG_SATA_VIA=y 829# CONFIG_SATA_VIA is not set
665# CONFIG_SATA_VITESSE is not set 830# CONFIG_SATA_VITESSE is not set
666# CONFIG_SATA_INIC162X is not set 831# CONFIG_SATA_INIC162X is not set
832# CONFIG_PATA_ACPI is not set
667# CONFIG_PATA_ALI is not set 833# CONFIG_PATA_ALI is not set
668# CONFIG_PATA_AMD is not set 834CONFIG_PATA_AMD=y
669# CONFIG_PATA_ARTOP is not set 835# CONFIG_PATA_ARTOP is not set
670# CONFIG_PATA_ATIIXP is not set 836# CONFIG_PATA_ATIIXP is not set
671# CONFIG_PATA_CMD640_PCI is not set 837# CONFIG_PATA_CMD640_PCI is not set
@@ -673,6 +839,7 @@ CONFIG_SATA_VIA=y
673# CONFIG_PATA_CS5520 is not set 839# CONFIG_PATA_CS5520 is not set
674# CONFIG_PATA_CS5530 is not set 840# CONFIG_PATA_CS5530 is not set
675# CONFIG_PATA_CS5535 is not set 841# CONFIG_PATA_CS5535 is not set
842# CONFIG_PATA_CS5536 is not set
676# CONFIG_PATA_CYPRESS is not set 843# CONFIG_PATA_CYPRESS is not set
677# CONFIG_PATA_EFAR is not set 844# CONFIG_PATA_EFAR is not set
678# CONFIG_ATA_GENERIC is not set 845# CONFIG_ATA_GENERIC is not set
@@ -686,11 +853,14 @@ CONFIG_SATA_VIA=y
686# CONFIG_PATA_TRIFLEX is not set 853# CONFIG_PATA_TRIFLEX is not set
687# CONFIG_PATA_MARVELL is not set 854# CONFIG_PATA_MARVELL is not set
688# CONFIG_PATA_MPIIX is not set 855# CONFIG_PATA_MPIIX is not set
689# CONFIG_PATA_OLDPIIX is not set 856CONFIG_PATA_OLDPIIX=y
690# CONFIG_PATA_NETCELL is not set 857# CONFIG_PATA_NETCELL is not set
858# CONFIG_PATA_NINJA32 is not set
691# CONFIG_PATA_NS87410 is not set 859# CONFIG_PATA_NS87410 is not set
860# CONFIG_PATA_NS87415 is not set
692# CONFIG_PATA_OPTI is not set 861# CONFIG_PATA_OPTI is not set
693# CONFIG_PATA_OPTIDMA is not set 862# CONFIG_PATA_OPTIDMA is not set
863# CONFIG_PATA_PCMCIA is not set
694# CONFIG_PATA_PDC_OLD is not set 864# CONFIG_PATA_PDC_OLD is not set
695# CONFIG_PATA_RADISYS is not set 865# CONFIG_PATA_RADISYS is not set
696# CONFIG_PATA_RZ1000 is not set 866# CONFIG_PATA_RZ1000 is not set
@@ -702,65 +872,42 @@ CONFIG_SATA_VIA=y
702# CONFIG_PATA_VIA is not set 872# CONFIG_PATA_VIA is not set
703# CONFIG_PATA_WINBOND is not set 873# CONFIG_PATA_WINBOND is not set
704CONFIG_MD=y 874CONFIG_MD=y
705# CONFIG_BLK_DEV_MD is not set 875CONFIG_BLK_DEV_MD=y
876# CONFIG_MD_LINEAR is not set
877# CONFIG_MD_RAID0 is not set
878# CONFIG_MD_RAID1 is not set
879# CONFIG_MD_RAID10 is not set
880# CONFIG_MD_RAID456 is not set
881# CONFIG_MD_MULTIPATH is not set
882# CONFIG_MD_FAULTY is not set
706CONFIG_BLK_DEV_DM=y 883CONFIG_BLK_DEV_DM=y
707# CONFIG_DM_DEBUG is not set 884# CONFIG_DM_DEBUG is not set
708# CONFIG_DM_CRYPT is not set 885# CONFIG_DM_CRYPT is not set
709# CONFIG_DM_SNAPSHOT is not set 886# CONFIG_DM_SNAPSHOT is not set
710# CONFIG_DM_MIRROR is not set 887CONFIG_DM_MIRROR=y
711# CONFIG_DM_ZERO is not set 888CONFIG_DM_ZERO=y
712# CONFIG_DM_MULTIPATH is not set 889# CONFIG_DM_MULTIPATH is not set
713# CONFIG_DM_DELAY is not set 890# CONFIG_DM_DELAY is not set
714 891# CONFIG_DM_UEVENT is not set
715# 892# CONFIG_FUSION is not set
716# Fusion MPT device support
717#
718CONFIG_FUSION=y
719CONFIG_FUSION_SPI=y
720# CONFIG_FUSION_FC is not set
721# CONFIG_FUSION_SAS is not set
722CONFIG_FUSION_MAX_SGE=128
723# CONFIG_FUSION_CTL is not set
724 893
725# 894#
726# IEEE 1394 (FireWire) support 895# IEEE 1394 (FireWire) support
727# 896#
728# CONFIG_FIREWIRE is not set 897# CONFIG_FIREWIRE is not set
729CONFIG_IEEE1394=y 898# CONFIG_IEEE1394 is not set
730
731#
732# Subsystem Options
733#
734# CONFIG_IEEE1394_VERBOSEDEBUG is not set
735
736#
737# Controllers
738#
739
740#
741# Texas Instruments PCILynx requires I2C
742#
743CONFIG_IEEE1394_OHCI1394=y
744
745#
746# Protocols
747#
748# CONFIG_IEEE1394_VIDEO1394 is not set
749# CONFIG_IEEE1394_SBP2 is not set
750# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
751# CONFIG_IEEE1394_ETH1394 is not set
752# CONFIG_IEEE1394_DV1394 is not set
753CONFIG_IEEE1394_RAWIO=y
754# CONFIG_I2O is not set 899# CONFIG_I2O is not set
755CONFIG_MACINTOSH_DRIVERS=y 900CONFIG_MACINTOSH_DRIVERS=y
756# CONFIG_MAC_EMUMOUSEBTN is not set 901CONFIG_MAC_EMUMOUSEBTN=y
757CONFIG_NETDEVICES=y 902CONFIG_NETDEVICES=y
758CONFIG_NETDEVICES_MULTIQUEUE=y 903# CONFIG_NETDEVICES_MULTIQUEUE is not set
904# CONFIG_IFB is not set
759# CONFIG_DUMMY is not set 905# CONFIG_DUMMY is not set
760# CONFIG_BONDING is not set 906# CONFIG_BONDING is not set
761# CONFIG_MACVLAN is not set 907# CONFIG_MACVLAN is not set
762# CONFIG_EQUALIZER is not set 908# CONFIG_EQUALIZER is not set
763# CONFIG_TUN is not set 909# CONFIG_TUN is not set
910# CONFIG_VETH is not set
764# CONFIG_NET_SB1000 is not set 911# CONFIG_NET_SB1000 is not set
765# CONFIG_ARCNET is not set 912# CONFIG_ARCNET is not set
766# CONFIG_PHYLIB is not set 913# CONFIG_PHYLIB is not set
@@ -770,38 +917,40 @@ CONFIG_MII=y
770# CONFIG_SUNGEM is not set 917# CONFIG_SUNGEM is not set
771# CONFIG_CASSINI is not set 918# CONFIG_CASSINI is not set
772CONFIG_NET_VENDOR_3COM=y 919CONFIG_NET_VENDOR_3COM=y
773CONFIG_VORTEX=y 920# CONFIG_VORTEX is not set
774# CONFIG_TYPHOON is not set 921# CONFIG_TYPHOON is not set
775CONFIG_NET_TULIP=y 922CONFIG_NET_TULIP=y
776# CONFIG_DE2104X is not set 923# CONFIG_DE2104X is not set
777CONFIG_TULIP=y 924# CONFIG_TULIP is not set
778# CONFIG_TULIP_MWI is not set
779# CONFIG_TULIP_MMIO is not set
780# CONFIG_TULIP_NAPI is not set
781# CONFIG_DE4X5 is not set 925# CONFIG_DE4X5 is not set
782# CONFIG_WINBOND_840 is not set 926# CONFIG_WINBOND_840 is not set
783# CONFIG_DM9102 is not set 927# CONFIG_DM9102 is not set
784# CONFIG_ULI526X is not set 928# CONFIG_ULI526X is not set
929# CONFIG_PCMCIA_XIRCOM is not set
785# CONFIG_HP100 is not set 930# CONFIG_HP100 is not set
931# CONFIG_IBM_NEW_EMAC_ZMII is not set
932# CONFIG_IBM_NEW_EMAC_RGMII is not set
933# CONFIG_IBM_NEW_EMAC_TAH is not set
934# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
786CONFIG_NET_PCI=y 935CONFIG_NET_PCI=y
787# CONFIG_PCNET32 is not set 936# CONFIG_PCNET32 is not set
788# CONFIG_AMD8111_ETH is not set 937# CONFIG_AMD8111_ETH is not set
789# CONFIG_ADAPTEC_STARFIRE is not set 938# CONFIG_ADAPTEC_STARFIRE is not set
790CONFIG_B44=y 939# CONFIG_B44 is not set
791CONFIG_FORCEDETH=y 940CONFIG_FORCEDETH=y
792# CONFIG_FORCEDETH_NAPI is not set 941# CONFIG_FORCEDETH_NAPI is not set
793# CONFIG_DGRS is not set
794# CONFIG_EEPRO100 is not set 942# CONFIG_EEPRO100 is not set
795CONFIG_E100=y 943CONFIG_E100=y
796# CONFIG_FEALNX is not set 944# CONFIG_FEALNX is not set
797# CONFIG_NATSEMI is not set 945# CONFIG_NATSEMI is not set
798# CONFIG_NE2K_PCI is not set 946# CONFIG_NE2K_PCI is not set
799CONFIG_8139CP=y 947# CONFIG_8139CP is not set
800CONFIG_8139TOO=y 948CONFIG_8139TOO=y
801# CONFIG_8139TOO_PIO is not set 949CONFIG_8139TOO_PIO=y
802# CONFIG_8139TOO_TUNE_TWISTER is not set 950# CONFIG_8139TOO_TUNE_TWISTER is not set
803# CONFIG_8139TOO_8129 is not set 951# CONFIG_8139TOO_8129 is not set
804# CONFIG_8139_OLD_RX_RESET is not set 952# CONFIG_8139_OLD_RX_RESET is not set
953# CONFIG_R6040 is not set
805# CONFIG_SIS900 is not set 954# CONFIG_SIS900 is not set
806# CONFIG_EPIC100 is not set 955# CONFIG_EPIC100 is not set
807# CONFIG_SUNDANCE is not set 956# CONFIG_SUNDANCE is not set
@@ -814,34 +963,75 @@ CONFIG_NETDEV_1000=y
814CONFIG_E1000=y 963CONFIG_E1000=y
815# CONFIG_E1000_NAPI is not set 964# CONFIG_E1000_NAPI is not set
816# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 965# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
966# CONFIG_E1000E is not set
967# CONFIG_E1000E_ENABLED is not set
968# CONFIG_IP1000 is not set
969# CONFIG_IGB is not set
817# CONFIG_NS83820 is not set 970# CONFIG_NS83820 is not set
818# CONFIG_HAMACHI is not set 971# CONFIG_HAMACHI is not set
819# CONFIG_YELLOWFIN is not set 972# CONFIG_YELLOWFIN is not set
820CONFIG_R8169=y 973# CONFIG_R8169 is not set
821# CONFIG_R8169_NAPI is not set
822# CONFIG_SIS190 is not set 974# CONFIG_SIS190 is not set
823# CONFIG_SKGE is not set 975# CONFIG_SKGE is not set
824CONFIG_SKY2=y 976CONFIG_SKY2=y
977# CONFIG_SKY2_DEBUG is not set
825# CONFIG_VIA_VELOCITY is not set 978# CONFIG_VIA_VELOCITY is not set
826CONFIG_TIGON3=y 979CONFIG_TIGON3=y
827CONFIG_BNX2=y 980# CONFIG_BNX2 is not set
828# CONFIG_QLA3XXX is not set 981# CONFIG_QLA3XXX is not set
829# CONFIG_ATL1 is not set 982# CONFIG_ATL1 is not set
830CONFIG_NETDEV_10000=y 983CONFIG_NETDEV_10000=y
831# CONFIG_CHELSIO_T1 is not set 984# CONFIG_CHELSIO_T1 is not set
832# CONFIG_CHELSIO_T3 is not set 985# CONFIG_CHELSIO_T3 is not set
986# CONFIG_IXGBE is not set
833# CONFIG_IXGB is not set 987# CONFIG_IXGB is not set
834# CONFIG_S2IO is not set 988# CONFIG_S2IO is not set
835# CONFIG_MYRI10GE is not set 989# CONFIG_MYRI10GE is not set
836# CONFIG_NETXEN_NIC is not set 990# CONFIG_NETXEN_NIC is not set
991# CONFIG_NIU is not set
837# CONFIG_MLX4_CORE is not set 992# CONFIG_MLX4_CORE is not set
838# CONFIG_TR is not set 993# CONFIG_TEHUTI is not set
994# CONFIG_BNX2X is not set
995# CONFIG_SFC is not set
996CONFIG_TR=y
997# CONFIG_IBMOL is not set
998# CONFIG_IBMLS is not set
999# CONFIG_3C359 is not set
1000# CONFIG_TMS380TR is not set
839 1001
840# 1002#
841# Wireless LAN 1003# Wireless LAN
842# 1004#
843# CONFIG_WLAN_PRE80211 is not set 1005# CONFIG_WLAN_PRE80211 is not set
844# CONFIG_WLAN_80211 is not set 1006CONFIG_WLAN_80211=y
1007# CONFIG_PCMCIA_RAYCS is not set
1008# CONFIG_IPW2100 is not set
1009# CONFIG_IPW2200 is not set
1010# CONFIG_LIBERTAS is not set
1011# CONFIG_AIRO is not set
1012# CONFIG_HERMES is not set
1013# CONFIG_ATMEL is not set
1014# CONFIG_AIRO_CS is not set
1015# CONFIG_PCMCIA_WL3501 is not set
1016# CONFIG_PRISM54 is not set
1017# CONFIG_USB_ZD1201 is not set
1018# CONFIG_USB_NET_RNDIS_WLAN is not set
1019# CONFIG_RTL8180 is not set
1020# CONFIG_RTL8187 is not set
1021# CONFIG_ADM8211 is not set
1022# CONFIG_P54_COMMON is not set
1023CONFIG_ATH5K=y
1024# CONFIG_ATH5K_DEBUG is not set
1025# CONFIG_IWLWIFI is not set
1026# CONFIG_IWLCORE is not set
1027# CONFIG_IWLWIFI_LEDS is not set
1028# CONFIG_IWL4965 is not set
1029# CONFIG_IWL3945 is not set
1030# CONFIG_HOSTAP is not set
1031# CONFIG_B43 is not set
1032# CONFIG_B43LEGACY is not set
1033# CONFIG_ZD1211RW is not set
1034# CONFIG_RT2X00 is not set
845 1035
846# 1036#
847# USB Network Adapters 1037# USB Network Adapters
@@ -850,16 +1040,27 @@ CONFIG_NETDEV_10000=y
850# CONFIG_USB_KAWETH is not set 1040# CONFIG_USB_KAWETH is not set
851# CONFIG_USB_PEGASUS is not set 1041# CONFIG_USB_PEGASUS is not set
852# CONFIG_USB_RTL8150 is not set 1042# CONFIG_USB_RTL8150 is not set
853# CONFIG_USB_USBNET_MII is not set
854# CONFIG_USB_USBNET is not set 1043# CONFIG_USB_USBNET is not set
1044CONFIG_NET_PCMCIA=y
1045# CONFIG_PCMCIA_3C589 is not set
1046# CONFIG_PCMCIA_3C574 is not set
1047# CONFIG_PCMCIA_FMVJ18X is not set
1048# CONFIG_PCMCIA_PCNET is not set
1049# CONFIG_PCMCIA_NMCLAN is not set
1050# CONFIG_PCMCIA_SMC91C92 is not set
1051# CONFIG_PCMCIA_XIRC2PS is not set
1052# CONFIG_PCMCIA_AXNET is not set
1053# CONFIG_PCMCIA_IBMTR is not set
855# CONFIG_WAN is not set 1054# CONFIG_WAN is not set
856# CONFIG_FDDI is not set 1055CONFIG_FDDI=y
1056# CONFIG_DEFXX is not set
1057# CONFIG_SKFP is not set
857# CONFIG_HIPPI is not set 1058# CONFIG_HIPPI is not set
858# CONFIG_PPP is not set 1059# CONFIG_PPP is not set
859# CONFIG_SLIP is not set 1060# CONFIG_SLIP is not set
860# CONFIG_NET_FC is not set 1061# CONFIG_NET_FC is not set
861# CONFIG_SHAPER is not set
862CONFIG_NETCONSOLE=y 1062CONFIG_NETCONSOLE=y
1063# CONFIG_NETCONSOLE_DYNAMIC is not set
863CONFIG_NETPOLL=y 1064CONFIG_NETPOLL=y
864# CONFIG_NETPOLL_TRAP is not set 1065# CONFIG_NETPOLL_TRAP is not set
865CONFIG_NET_POLL_CONTROLLER=y 1066CONFIG_NET_POLL_CONTROLLER=y
@@ -870,18 +1071,17 @@ CONFIG_NET_POLL_CONTROLLER=y
870# Input device support 1071# Input device support
871# 1072#
872CONFIG_INPUT=y 1073CONFIG_INPUT=y
873# CONFIG_INPUT_FF_MEMLESS is not set 1074CONFIG_INPUT_FF_MEMLESS=y
874# CONFIG_INPUT_POLLDEV is not set 1075CONFIG_INPUT_POLLDEV=y
875 1076
876# 1077#
877# Userland interfaces 1078# Userland interfaces
878# 1079#
879CONFIG_INPUT_MOUSEDEV=y 1080CONFIG_INPUT_MOUSEDEV=y
880CONFIG_INPUT_MOUSEDEV_PSAUX=y 1081# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
881CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 1082CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
882CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 1083CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
883# CONFIG_INPUT_JOYDEV is not set 1084# CONFIG_INPUT_JOYDEV is not set
884# CONFIG_INPUT_TSDEV is not set
885CONFIG_INPUT_EVDEV=y 1085CONFIG_INPUT_EVDEV=y
886# CONFIG_INPUT_EVBUG is not set 1086# CONFIG_INPUT_EVBUG is not set
887 1087
@@ -906,17 +1106,63 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
906# CONFIG_MOUSE_SERIAL is not set 1106# CONFIG_MOUSE_SERIAL is not set
907# CONFIG_MOUSE_APPLETOUCH is not set 1107# CONFIG_MOUSE_APPLETOUCH is not set
908# CONFIG_MOUSE_VSXXXAA is not set 1108# CONFIG_MOUSE_VSXXXAA is not set
909# CONFIG_INPUT_JOYSTICK is not set 1109CONFIG_INPUT_JOYSTICK=y
910# CONFIG_INPUT_TABLET is not set 1110# CONFIG_JOYSTICK_ANALOG is not set
911# CONFIG_INPUT_TOUCHSCREEN is not set 1111# CONFIG_JOYSTICK_A3D is not set
912# CONFIG_INPUT_MISC is not set 1112# CONFIG_JOYSTICK_ADI is not set
1113# CONFIG_JOYSTICK_COBRA is not set
1114# CONFIG_JOYSTICK_GF2K is not set
1115# CONFIG_JOYSTICK_GRIP is not set
1116# CONFIG_JOYSTICK_GRIP_MP is not set
1117# CONFIG_JOYSTICK_GUILLEMOT is not set
1118# CONFIG_JOYSTICK_INTERACT is not set
1119# CONFIG_JOYSTICK_SIDEWINDER is not set
1120# CONFIG_JOYSTICK_TMDC is not set
1121# CONFIG_JOYSTICK_IFORCE is not set
1122# CONFIG_JOYSTICK_WARRIOR is not set
1123# CONFIG_JOYSTICK_MAGELLAN is not set
1124# CONFIG_JOYSTICK_SPACEORB is not set
1125# CONFIG_JOYSTICK_SPACEBALL is not set
1126# CONFIG_JOYSTICK_STINGER is not set
1127# CONFIG_JOYSTICK_TWIDJOY is not set
1128# CONFIG_JOYSTICK_ZHENHUA is not set
1129# CONFIG_JOYSTICK_JOYDUMP is not set
1130# CONFIG_JOYSTICK_XPAD is not set
1131CONFIG_INPUT_TABLET=y
1132# CONFIG_TABLET_USB_ACECAD is not set
1133# CONFIG_TABLET_USB_AIPTEK is not set
1134# CONFIG_TABLET_USB_GTCO is not set
1135# CONFIG_TABLET_USB_KBTAB is not set
1136# CONFIG_TABLET_USB_WACOM is not set
1137CONFIG_INPUT_TOUCHSCREEN=y
1138# CONFIG_TOUCHSCREEN_FUJITSU is not set
1139# CONFIG_TOUCHSCREEN_GUNZE is not set
1140# CONFIG_TOUCHSCREEN_ELO is not set
1141# CONFIG_TOUCHSCREEN_MTOUCH is not set
1142# CONFIG_TOUCHSCREEN_MK712 is not set
1143# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1144# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1145# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1146# CONFIG_TOUCHSCREEN_UCB1400 is not set
1147# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1148CONFIG_INPUT_MISC=y
1149# CONFIG_INPUT_PCSPKR is not set
1150# CONFIG_INPUT_APANEL is not set
1151# CONFIG_INPUT_WISTRON_BTNS is not set
1152# CONFIG_INPUT_ATLAS_BTNS is not set
1153# CONFIG_INPUT_ATI_REMOTE is not set
1154# CONFIG_INPUT_ATI_REMOTE2 is not set
1155# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1156# CONFIG_INPUT_POWERMATE is not set
1157# CONFIG_INPUT_YEALINK is not set
1158# CONFIG_INPUT_UINPUT is not set
913 1159
914# 1160#
915# Hardware I/O ports 1161# Hardware I/O ports
916# 1162#
917CONFIG_SERIO=y 1163CONFIG_SERIO=y
918CONFIG_SERIO_I8042=y 1164CONFIG_SERIO_I8042=y
919# CONFIG_SERIO_SERPORT is not set 1165CONFIG_SERIO_SERPORT=y
920# CONFIG_SERIO_CT82C710 is not set 1166# CONFIG_SERIO_CT82C710 is not set
921# CONFIG_SERIO_PCIPS2 is not set 1167# CONFIG_SERIO_PCIPS2 is not set
922CONFIG_SERIO_LIBPS2=y 1168CONFIG_SERIO_LIBPS2=y
@@ -929,8 +1175,26 @@ CONFIG_SERIO_LIBPS2=y
929CONFIG_VT=y 1175CONFIG_VT=y
930CONFIG_VT_CONSOLE=y 1176CONFIG_VT_CONSOLE=y
931CONFIG_HW_CONSOLE=y 1177CONFIG_HW_CONSOLE=y
932# CONFIG_VT_HW_CONSOLE_BINDING is not set 1178CONFIG_VT_HW_CONSOLE_BINDING=y
933# CONFIG_SERIAL_NONSTANDARD is not set 1179CONFIG_DEVKMEM=y
1180CONFIG_SERIAL_NONSTANDARD=y
1181# CONFIG_COMPUTONE is not set
1182# CONFIG_ROCKETPORT is not set
1183# CONFIG_CYCLADES is not set
1184# CONFIG_DIGIEPCA is not set
1185# CONFIG_MOXA_INTELLIO is not set
1186# CONFIG_MOXA_SMARTIO is not set
1187# CONFIG_ISI is not set
1188# CONFIG_SYNCLINK is not set
1189# CONFIG_SYNCLINKMP is not set
1190# CONFIG_SYNCLINK_GT is not set
1191# CONFIG_N_HDLC is not set
1192# CONFIG_RISCOM8 is not set
1193# CONFIG_SPECIALIX is not set
1194# CONFIG_SX is not set
1195# CONFIG_RIO is not set
1196# CONFIG_STALDRV is not set
1197# CONFIG_NOZOMI is not set
934 1198
935# 1199#
936# Serial drivers 1200# Serial drivers
@@ -940,9 +1204,14 @@ CONFIG_SERIAL_8250_CONSOLE=y
940CONFIG_FIX_EARLYCON_MEM=y 1204CONFIG_FIX_EARLYCON_MEM=y
941CONFIG_SERIAL_8250_PCI=y 1205CONFIG_SERIAL_8250_PCI=y
942CONFIG_SERIAL_8250_PNP=y 1206CONFIG_SERIAL_8250_PNP=y
943CONFIG_SERIAL_8250_NR_UARTS=4 1207# CONFIG_SERIAL_8250_CS is not set
1208CONFIG_SERIAL_8250_NR_UARTS=32
944CONFIG_SERIAL_8250_RUNTIME_UARTS=4 1209CONFIG_SERIAL_8250_RUNTIME_UARTS=4
945# CONFIG_SERIAL_8250_EXTENDED is not set 1210CONFIG_SERIAL_8250_EXTENDED=y
1211CONFIG_SERIAL_8250_MANY_PORTS=y
1212CONFIG_SERIAL_8250_SHARE_IRQ=y
1213CONFIG_SERIAL_8250_DETECT_IRQ=y
1214CONFIG_SERIAL_8250_RSA=y
946 1215
947# 1216#
948# Non-8250 serial port support 1217# Non-8250 serial port support
@@ -951,89 +1220,275 @@ CONFIG_SERIAL_CORE=y
951CONFIG_SERIAL_CORE_CONSOLE=y 1220CONFIG_SERIAL_CORE_CONSOLE=y
952# CONFIG_SERIAL_JSM is not set 1221# CONFIG_SERIAL_JSM is not set
953CONFIG_UNIX98_PTYS=y 1222CONFIG_UNIX98_PTYS=y
954CONFIG_LEGACY_PTYS=y 1223# CONFIG_LEGACY_PTYS is not set
955CONFIG_LEGACY_PTY_COUNT=256
956# CONFIG_IPMI_HANDLER is not set 1224# CONFIG_IPMI_HANDLER is not set
957# CONFIG_WATCHDOG is not set
958CONFIG_HW_RANDOM=y 1225CONFIG_HW_RANDOM=y
959CONFIG_HW_RANDOM_INTEL=y 1226# CONFIG_HW_RANDOM_INTEL is not set
960CONFIG_HW_RANDOM_AMD=y 1227# CONFIG_HW_RANDOM_AMD is not set
961CONFIG_HW_RANDOM_GEODE=y 1228CONFIG_HW_RANDOM_GEODE=y
962CONFIG_HW_RANDOM_VIA=y 1229CONFIG_HW_RANDOM_VIA=y
963# CONFIG_NVRAM is not set 1230CONFIG_NVRAM=y
964CONFIG_RTC=y
965# CONFIG_R3964 is not set 1231# CONFIG_R3964 is not set
966# CONFIG_APPLICOM is not set 1232# CONFIG_APPLICOM is not set
967# CONFIG_SONYPI is not set 1233# CONFIG_SONYPI is not set
968CONFIG_AGP=y 1234
969# CONFIG_AGP_ALI is not set 1235#
970# CONFIG_AGP_ATI is not set 1236# PCMCIA character devices
971# CONFIG_AGP_AMD is not set 1237#
972CONFIG_AGP_AMD64=y 1238# CONFIG_SYNCLINK_CS is not set
973CONFIG_AGP_INTEL=y 1239# CONFIG_CARDMAN_4000 is not set
974# CONFIG_AGP_NVIDIA is not set 1240# CONFIG_CARDMAN_4040 is not set
975# CONFIG_AGP_SIS is not set 1241# CONFIG_IPWIRELESS is not set
976# CONFIG_AGP_SWORKS is not set
977# CONFIG_AGP_VIA is not set
978# CONFIG_AGP_EFFICEON is not set
979# CONFIG_DRM is not set
980# CONFIG_MWAVE is not set 1242# CONFIG_MWAVE is not set
981# CONFIG_PC8736x_GPIO is not set 1243# CONFIG_PC8736x_GPIO is not set
982# CONFIG_NSC_GPIO is not set 1244# CONFIG_NSC_GPIO is not set
983# CONFIG_CS5535_GPIO is not set 1245# CONFIG_CS5535_GPIO is not set
984CONFIG_RAW_DRIVER=y 1246# CONFIG_RAW_DRIVER is not set
985CONFIG_MAX_RAW_DEVS=256
986CONFIG_HPET=y 1247CONFIG_HPET=y
987# CONFIG_HPET_RTC_IRQ is not set 1248# CONFIG_HPET_RTC_IRQ is not set
988CONFIG_HPET_MMAP=y 1249# CONFIG_HPET_MMAP is not set
989# CONFIG_HANGCHECK_TIMER is not set 1250# CONFIG_HANGCHECK_TIMER is not set
990# CONFIG_TCG_TPM is not set 1251# CONFIG_TCG_TPM is not set
991# CONFIG_TELCLOCK is not set 1252# CONFIG_TELCLOCK is not set
992CONFIG_DEVPORT=y 1253CONFIG_DEVPORT=y
993# CONFIG_I2C is not set 1254CONFIG_I2C=y
994 1255CONFIG_I2C_BOARDINFO=y
995# 1256# CONFIG_I2C_CHARDEV is not set
996# SPI support 1257
997# 1258#
1259# I2C Hardware Bus support
1260#
1261# CONFIG_I2C_ALI1535 is not set
1262# CONFIG_I2C_ALI1563 is not set
1263# CONFIG_I2C_ALI15X3 is not set
1264# CONFIG_I2C_AMD756 is not set
1265# CONFIG_I2C_AMD8111 is not set
1266CONFIG_I2C_I801=y
1267# CONFIG_I2C_I810 is not set
1268# CONFIG_I2C_PIIX4 is not set
1269# CONFIG_I2C_NFORCE2 is not set
1270# CONFIG_I2C_OCORES is not set
1271# CONFIG_I2C_PARPORT_LIGHT is not set
1272# CONFIG_I2C_PROSAVAGE is not set
1273# CONFIG_I2C_SAVAGE4 is not set
1274# CONFIG_I2C_SIMTEC is not set
1275# CONFIG_SCx200_ACB is not set
1276# CONFIG_I2C_SIS5595 is not set
1277# CONFIG_I2C_SIS630 is not set
1278# CONFIG_I2C_SIS96X is not set
1279# CONFIG_I2C_TAOS_EVM is not set
1280# CONFIG_I2C_STUB is not set
1281# CONFIG_I2C_TINY_USB is not set
1282# CONFIG_I2C_VIA is not set
1283# CONFIG_I2C_VIAPRO is not set
1284# CONFIG_I2C_VOODOO3 is not set
1285# CONFIG_I2C_PCA_PLATFORM is not set
1286
1287#
1288# Miscellaneous I2C Chip support
1289#
1290# CONFIG_DS1682 is not set
1291# CONFIG_SENSORS_EEPROM is not set
1292# CONFIG_SENSORS_PCF8574 is not set
1293# CONFIG_PCF8575 is not set
1294# CONFIG_SENSORS_PCF8591 is not set
1295# CONFIG_SENSORS_MAX6875 is not set
1296# CONFIG_SENSORS_TSL2550 is not set
1297# CONFIG_I2C_DEBUG_CORE is not set
1298# CONFIG_I2C_DEBUG_ALGO is not set
1299# CONFIG_I2C_DEBUG_BUS is not set
1300# CONFIG_I2C_DEBUG_CHIP is not set
998# CONFIG_SPI is not set 1301# CONFIG_SPI is not set
999# CONFIG_SPI_MASTER is not set
1000# CONFIG_W1 is not set 1302# CONFIG_W1 is not set
1001# CONFIG_POWER_SUPPLY is not set 1303CONFIG_POWER_SUPPLY=y
1304# CONFIG_POWER_SUPPLY_DEBUG is not set
1305# CONFIG_PDA_POWER is not set
1306# CONFIG_BATTERY_DS2760 is not set
1002# CONFIG_HWMON is not set 1307# CONFIG_HWMON is not set
1308CONFIG_THERMAL=y
1309CONFIG_WATCHDOG=y
1310# CONFIG_WATCHDOG_NOWAYOUT is not set
1311
1312#
1313# Watchdog Device Drivers
1314#
1315# CONFIG_SOFT_WATCHDOG is not set
1316# CONFIG_ACQUIRE_WDT is not set
1317# CONFIG_ADVANTECH_WDT is not set
1318# CONFIG_ALIM1535_WDT is not set
1319# CONFIG_ALIM7101_WDT is not set
1320# CONFIG_SC520_WDT is not set
1321# CONFIG_EUROTECH_WDT is not set
1322# CONFIG_IB700_WDT is not set
1323# CONFIG_IBMASR is not set
1324# CONFIG_WAFER_WDT is not set
1325# CONFIG_I6300ESB_WDT is not set
1326# CONFIG_ITCO_WDT is not set
1327# CONFIG_IT8712F_WDT is not set
1328# CONFIG_HP_WATCHDOG is not set
1329# CONFIG_SC1200_WDT is not set
1330# CONFIG_PC87413_WDT is not set
1331# CONFIG_60XX_WDT is not set
1332# CONFIG_SBC8360_WDT is not set
1333# CONFIG_SBC7240_WDT is not set
1334# CONFIG_CPU5_WDT is not set
1335# CONFIG_SMSC37B787_WDT is not set
1336# CONFIG_W83627HF_WDT is not set
1337# CONFIG_W83697HF_WDT is not set
1338# CONFIG_W83877F_WDT is not set
1339# CONFIG_W83977F_WDT is not set
1340# CONFIG_MACHZ_WDT is not set
1341# CONFIG_SBC_EPX_C3_WATCHDOG is not set
1342
1343#
1344# PCI-based Watchdog Cards
1345#
1346# CONFIG_PCIPCWATCHDOG is not set
1347# CONFIG_WDTPCI is not set
1348
1349#
1350# USB-based Watchdog Cards
1351#
1352# CONFIG_USBPCWATCHDOG is not set
1353
1354#
1355# Sonics Silicon Backplane
1356#
1357CONFIG_SSB_POSSIBLE=y
1358# CONFIG_SSB is not set
1003 1359
1004# 1360#
1005# Multifunction device drivers 1361# Multifunction device drivers
1006# 1362#
1007# CONFIG_MFD_SM501 is not set 1363# CONFIG_MFD_SM501 is not set
1364# CONFIG_HTC_PASIC3 is not set
1008 1365
1009# 1366#
1010# Multimedia devices 1367# Multimedia devices
1011# 1368#
1369
1370#
1371# Multimedia core support
1372#
1012# CONFIG_VIDEO_DEV is not set 1373# CONFIG_VIDEO_DEV is not set
1013# CONFIG_DVB_CORE is not set 1374# CONFIG_DVB_CORE is not set
1375
1376#
1377# Multimedia drivers
1378#
1014CONFIG_DAB=y 1379CONFIG_DAB=y
1015# CONFIG_USB_DABUSB is not set 1380# CONFIG_USB_DABUSB is not set
1016 1381
1017# 1382#
1018# Graphics support 1383# Graphics support
1019# 1384#
1020# CONFIG_BACKLIGHT_LCD_SUPPORT is not set 1385CONFIG_AGP=y
1386# CONFIG_AGP_ALI is not set
1387# CONFIG_AGP_ATI is not set
1388# CONFIG_AGP_AMD is not set
1389CONFIG_AGP_AMD64=y
1390CONFIG_AGP_INTEL=y
1391# CONFIG_AGP_NVIDIA is not set
1392# CONFIG_AGP_SIS is not set
1393# CONFIG_AGP_SWORKS is not set
1394# CONFIG_AGP_VIA is not set
1395# CONFIG_AGP_EFFICEON is not set
1396CONFIG_DRM=y
1397# CONFIG_DRM_TDFX is not set
1398# CONFIG_DRM_R128 is not set
1399# CONFIG_DRM_RADEON is not set
1400# CONFIG_DRM_I810 is not set
1401# CONFIG_DRM_I830 is not set
1402CONFIG_DRM_I915=y
1403# CONFIG_DRM_MGA is not set
1404# CONFIG_DRM_SIS is not set
1405# CONFIG_DRM_VIA is not set
1406# CONFIG_DRM_SAVAGE is not set
1407# CONFIG_VGASTATE is not set
1408# CONFIG_VIDEO_OUTPUT_CONTROL is not set
1409CONFIG_FB=y
1410# CONFIG_FIRMWARE_EDID is not set
1411# CONFIG_FB_DDC is not set
1412CONFIG_FB_CFB_FILLRECT=y
1413CONFIG_FB_CFB_COPYAREA=y
1414CONFIG_FB_CFB_IMAGEBLIT=y
1415# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
1416# CONFIG_FB_SYS_FILLRECT is not set
1417# CONFIG_FB_SYS_COPYAREA is not set
1418# CONFIG_FB_SYS_IMAGEBLIT is not set
1419# CONFIG_FB_FOREIGN_ENDIAN is not set
1420# CONFIG_FB_SYS_FOPS is not set
1421CONFIG_FB_DEFERRED_IO=y
1422# CONFIG_FB_SVGALIB is not set
1423# CONFIG_FB_MACMODES is not set
1424# CONFIG_FB_BACKLIGHT is not set
1425CONFIG_FB_MODE_HELPERS=y
1426CONFIG_FB_TILEBLITTING=y
1427
1428#
1429# Frame buffer hardware drivers
1430#
1431# CONFIG_FB_CIRRUS is not set
1432# CONFIG_FB_PM2 is not set
1433# CONFIG_FB_CYBER2000 is not set
1434# CONFIG_FB_ARC is not set
1435# CONFIG_FB_ASILIANT is not set
1436# CONFIG_FB_IMSTT is not set
1437# CONFIG_FB_VGA16 is not set
1438# CONFIG_FB_UVESA is not set
1439# CONFIG_FB_VESA is not set
1440CONFIG_FB_EFI=y
1441# CONFIG_FB_IMAC is not set
1442# CONFIG_FB_N411 is not set
1443# CONFIG_FB_HGA is not set
1444# CONFIG_FB_S1D13XXX is not set
1445# CONFIG_FB_NVIDIA is not set
1446# CONFIG_FB_RIVA is not set
1447# CONFIG_FB_I810 is not set
1448# CONFIG_FB_LE80578 is not set
1449# CONFIG_FB_INTEL is not set
1450# CONFIG_FB_MATROX is not set
1451# CONFIG_FB_RADEON is not set
1452# CONFIG_FB_ATY128 is not set
1453# CONFIG_FB_ATY is not set
1454# CONFIG_FB_S3 is not set
1455# CONFIG_FB_SAVAGE is not set
1456# CONFIG_FB_SIS is not set
1457# CONFIG_FB_NEOMAGIC is not set
1458# CONFIG_FB_KYRO is not set
1459# CONFIG_FB_3DFX is not set
1460# CONFIG_FB_VOODOO1 is not set
1461# CONFIG_FB_VT8623 is not set
1462# CONFIG_FB_CYBLA is not set
1463# CONFIG_FB_TRIDENT is not set
1464# CONFIG_FB_ARK is not set
1465# CONFIG_FB_PM3 is not set
1466# CONFIG_FB_GEODE is not set
1467# CONFIG_FB_VIRTUAL is not set
1468CONFIG_BACKLIGHT_LCD_SUPPORT=y
1469# CONFIG_LCD_CLASS_DEVICE is not set
1470CONFIG_BACKLIGHT_CLASS_DEVICE=y
1471# CONFIG_BACKLIGHT_CORGI is not set
1472# CONFIG_BACKLIGHT_PROGEAR is not set
1021 1473
1022# 1474#
1023# Display device support 1475# Display device support
1024# 1476#
1025# CONFIG_DISPLAY_SUPPORT is not set 1477# CONFIG_DISPLAY_SUPPORT is not set
1026# CONFIG_VGASTATE is not set
1027# CONFIG_FB is not set
1028 1478
1029# 1479#
1030# Console display driver support 1480# Console display driver support
1031# 1481#
1032CONFIG_VGA_CONSOLE=y 1482CONFIG_VGA_CONSOLE=y
1033CONFIG_VGACON_SOFT_SCROLLBACK=y 1483CONFIG_VGACON_SOFT_SCROLLBACK=y
1034CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 1484CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1035CONFIG_VIDEO_SELECT=y 1485CONFIG_VIDEO_SELECT=y
1036CONFIG_DUMMY_CONSOLE=y 1486CONFIG_DUMMY_CONSOLE=y
1487# CONFIG_FRAMEBUFFER_CONSOLE is not set
1488CONFIG_LOGO=y
1489# CONFIG_LOGO_LINUX_MONO is not set
1490# CONFIG_LOGO_LINUX_VGA16 is not set
1491CONFIG_LOGO_LINUX_CLUT224=y
1037 1492
1038# 1493#
1039# Sound 1494# Sound
@@ -1043,33 +1498,167 @@ CONFIG_SOUND=y
1043# 1498#
1044# Advanced Linux Sound Architecture 1499# Advanced Linux Sound Architecture
1045# 1500#
1046# CONFIG_SND is not set 1501CONFIG_SND=y
1502CONFIG_SND_TIMER=y
1503CONFIG_SND_PCM=y
1504CONFIG_SND_HWDEP=y
1505CONFIG_SND_SEQUENCER=y
1506CONFIG_SND_SEQ_DUMMY=y
1507CONFIG_SND_OSSEMUL=y
1508CONFIG_SND_MIXER_OSS=y
1509CONFIG_SND_PCM_OSS=y
1510CONFIG_SND_PCM_OSS_PLUGINS=y
1511CONFIG_SND_SEQUENCER_OSS=y
1512CONFIG_SND_DYNAMIC_MINORS=y
1513CONFIG_SND_SUPPORT_OLD_API=y
1514CONFIG_SND_VERBOSE_PROCFS=y
1515# CONFIG_SND_VERBOSE_PRINTK is not set
1516# CONFIG_SND_DEBUG is not set
1517CONFIG_SND_VMASTER=y
1518
1519#
1520# Generic devices
1521#
1522# CONFIG_SND_PCSP is not set
1523# CONFIG_SND_DUMMY is not set
1524# CONFIG_SND_VIRMIDI is not set
1525# CONFIG_SND_MTPAV is not set
1526# CONFIG_SND_SERIAL_U16550 is not set
1527# CONFIG_SND_MPU401 is not set
1528
1529#
1530# PCI devices
1531#
1532# CONFIG_SND_AD1889 is not set
1533# CONFIG_SND_ALS300 is not set
1534# CONFIG_SND_ALS4000 is not set
1535# CONFIG_SND_ALI5451 is not set
1536# CONFIG_SND_ATIIXP is not set
1537# CONFIG_SND_ATIIXP_MODEM is not set
1538# CONFIG_SND_AU8810 is not set
1539# CONFIG_SND_AU8820 is not set
1540# CONFIG_SND_AU8830 is not set
1541# CONFIG_SND_AW2 is not set
1542# CONFIG_SND_AZT3328 is not set
1543# CONFIG_SND_BT87X is not set
1544# CONFIG_SND_CA0106 is not set
1545# CONFIG_SND_CMIPCI is not set
1546# CONFIG_SND_OXYGEN is not set
1547# CONFIG_SND_CS4281 is not set
1548# CONFIG_SND_CS46XX is not set
1549# CONFIG_SND_CS5530 is not set
1550# CONFIG_SND_CS5535AUDIO is not set
1551# CONFIG_SND_DARLA20 is not set
1552# CONFIG_SND_GINA20 is not set
1553# CONFIG_SND_LAYLA20 is not set
1554# CONFIG_SND_DARLA24 is not set
1555# CONFIG_SND_GINA24 is not set
1556# CONFIG_SND_LAYLA24 is not set
1557# CONFIG_SND_MONA is not set
1558# CONFIG_SND_MIA is not set
1559# CONFIG_SND_ECHO3G is not set
1560# CONFIG_SND_INDIGO is not set
1561# CONFIG_SND_INDIGOIO is not set
1562# CONFIG_SND_INDIGODJ is not set
1563# CONFIG_SND_EMU10K1 is not set
1564# CONFIG_SND_EMU10K1X is not set
1565# CONFIG_SND_ENS1370 is not set
1566# CONFIG_SND_ENS1371 is not set
1567# CONFIG_SND_ES1938 is not set
1568# CONFIG_SND_ES1968 is not set
1569# CONFIG_SND_FM801 is not set
1570CONFIG_SND_HDA_INTEL=y
1571CONFIG_SND_HDA_HWDEP=y
1572CONFIG_SND_HDA_CODEC_REALTEK=y
1573CONFIG_SND_HDA_CODEC_ANALOG=y
1574CONFIG_SND_HDA_CODEC_SIGMATEL=y
1575CONFIG_SND_HDA_CODEC_VIA=y
1576CONFIG_SND_HDA_CODEC_ATIHDMI=y
1577CONFIG_SND_HDA_CODEC_CONEXANT=y
1578CONFIG_SND_HDA_CODEC_CMEDIA=y
1579CONFIG_SND_HDA_CODEC_SI3054=y
1580CONFIG_SND_HDA_GENERIC=y
1581# CONFIG_SND_HDA_POWER_SAVE is not set
1582# CONFIG_SND_HDSP is not set
1583# CONFIG_SND_HDSPM is not set
1584# CONFIG_SND_HIFIER is not set
1585# CONFIG_SND_ICE1712 is not set
1586# CONFIG_SND_ICE1724 is not set
1587# CONFIG_SND_INTEL8X0 is not set
1588# CONFIG_SND_INTEL8X0M is not set
1589# CONFIG_SND_KORG1212 is not set
1590# CONFIG_SND_MAESTRO3 is not set
1591# CONFIG_SND_MIXART is not set
1592# CONFIG_SND_NM256 is not set
1593# CONFIG_SND_PCXHR is not set
1594# CONFIG_SND_RIPTIDE is not set
1595# CONFIG_SND_RME32 is not set
1596# CONFIG_SND_RME96 is not set
1597# CONFIG_SND_RME9652 is not set
1598# CONFIG_SND_SIS7019 is not set
1599# CONFIG_SND_SONICVIBES is not set
1600# CONFIG_SND_TRIDENT is not set
1601# CONFIG_SND_VIA82XX is not set
1602# CONFIG_SND_VIA82XX_MODEM is not set
1603# CONFIG_SND_VIRTUOSO is not set
1604# CONFIG_SND_VX222 is not set
1605# CONFIG_SND_YMFPCI is not set
1606
1607#
1608# USB devices
1609#
1610# CONFIG_SND_USB_AUDIO is not set
1611# CONFIG_SND_USB_USX2Y is not set
1612# CONFIG_SND_USB_CAIAQ is not set
1613
1614#
1615# PCMCIA devices
1616#
1617# CONFIG_SND_VXPOCKET is not set
1618# CONFIG_SND_PDAUDIOCF is not set
1619
1620#
1621# System on Chip audio support
1622#
1623# CONFIG_SND_SOC is not set
1624
1625#
1626# ALSA SoC audio for Freescale SOCs
1627#
1628
1629#
1630# SoC Audio for the Texas Instruments OMAP
1631#
1047 1632
1048# 1633#
1049# Open Sound System 1634# Open Sound System
1050# 1635#
1051CONFIG_SOUND_PRIME=y 1636# CONFIG_SOUND_PRIME is not set
1052# CONFIG_SOUND_TRIDENT is not set
1053# CONFIG_SOUND_MSNDCLAS is not set
1054# CONFIG_SOUND_MSNDPIN is not set
1055# CONFIG_SOUND_OSS is not set
1056CONFIG_HID_SUPPORT=y 1637CONFIG_HID_SUPPORT=y
1057CONFIG_HID=y 1638CONFIG_HID=y
1058# CONFIG_HID_DEBUG is not set 1639CONFIG_HID_DEBUG=y
1640CONFIG_HIDRAW=y
1059 1641
1060# 1642#
1061# USB Input Devices 1643# USB Input Devices
1062# 1644#
1063CONFIG_USB_HID=y 1645CONFIG_USB_HID=y
1064# CONFIG_USB_HIDINPUT_POWERBOOK is not set 1646CONFIG_USB_HIDINPUT_POWERBOOK=y
1065# CONFIG_HID_FF is not set 1647CONFIG_HID_FF=y
1066# CONFIG_USB_HIDDEV is not set 1648CONFIG_HID_PID=y
1649CONFIG_LOGITECH_FF=y
1650# CONFIG_LOGIRUMBLEPAD2_FF is not set
1651CONFIG_PANTHERLORD_FF=y
1652CONFIG_THRUSTMASTER_FF=y
1653CONFIG_ZEROPLUS_FF=y
1654CONFIG_USB_HIDDEV=y
1067CONFIG_USB_SUPPORT=y 1655CONFIG_USB_SUPPORT=y
1068CONFIG_USB_ARCH_HAS_HCD=y 1656CONFIG_USB_ARCH_HAS_HCD=y
1069CONFIG_USB_ARCH_HAS_OHCI=y 1657CONFIG_USB_ARCH_HAS_OHCI=y
1070CONFIG_USB_ARCH_HAS_EHCI=y 1658CONFIG_USB_ARCH_HAS_EHCI=y
1071CONFIG_USB=y 1659CONFIG_USB=y
1072# CONFIG_USB_DEBUG is not set 1660CONFIG_USB_DEBUG=y
1661CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
1073 1662
1074# 1663#
1075# Miscellaneous USB options 1664# Miscellaneous USB options
@@ -1077,18 +1666,18 @@ CONFIG_USB=y
1077CONFIG_USB_DEVICEFS=y 1666CONFIG_USB_DEVICEFS=y
1078# CONFIG_USB_DEVICE_CLASS is not set 1667# CONFIG_USB_DEVICE_CLASS is not set
1079# CONFIG_USB_DYNAMIC_MINORS is not set 1668# CONFIG_USB_DYNAMIC_MINORS is not set
1080# CONFIG_USB_SUSPEND is not set 1669CONFIG_USB_SUSPEND=y
1081# CONFIG_USB_PERSIST is not set
1082# CONFIG_USB_OTG is not set 1670# CONFIG_USB_OTG is not set
1083 1671
1084# 1672#
1085# USB Host Controller Drivers 1673# USB Host Controller Drivers
1086# 1674#
1675# CONFIG_USB_C67X00_HCD is not set
1087CONFIG_USB_EHCI_HCD=y 1676CONFIG_USB_EHCI_HCD=y
1088# CONFIG_USB_EHCI_SPLIT_ISO is not set
1089# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1677# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1090# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1678# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1091# CONFIG_USB_ISP116X_HCD is not set 1679# CONFIG_USB_ISP116X_HCD is not set
1680# CONFIG_USB_ISP1760_HCD is not set
1092CONFIG_USB_OHCI_HCD=y 1681CONFIG_USB_OHCI_HCD=y
1093# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set 1682# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
1094# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set 1683# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
@@ -1121,8 +1710,10 @@ CONFIG_USB_STORAGE=y
1121# CONFIG_USB_STORAGE_SDDR55 is not set 1710# CONFIG_USB_STORAGE_SDDR55 is not set
1122# CONFIG_USB_STORAGE_JUMPSHOT is not set 1711# CONFIG_USB_STORAGE_JUMPSHOT is not set
1123# CONFIG_USB_STORAGE_ALAUDA is not set 1712# CONFIG_USB_STORAGE_ALAUDA is not set
1713# CONFIG_USB_STORAGE_ONETOUCH is not set
1124# CONFIG_USB_STORAGE_KARMA is not set 1714# CONFIG_USB_STORAGE_KARMA is not set
1125# CONFIG_USB_LIBUSUAL is not set 1715# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1716CONFIG_USB_LIBUSUAL=y
1126 1717
1127# 1718#
1128# USB Imaging devices 1719# USB Imaging devices
@@ -1134,10 +1725,6 @@ CONFIG_USB_MON=y
1134# 1725#
1135# USB port drivers 1726# USB port drivers
1136# 1727#
1137
1138#
1139# USB Serial Converter support
1140#
1141# CONFIG_USB_SERIAL is not set 1728# CONFIG_USB_SERIAL is not set
1142 1729
1143# 1730#
@@ -1163,90 +1750,125 @@ CONFIG_USB_MON=y
1163# CONFIG_USB_TRANCEVIBRATOR is not set 1750# CONFIG_USB_TRANCEVIBRATOR is not set
1164# CONFIG_USB_IOWARRIOR is not set 1751# CONFIG_USB_IOWARRIOR is not set
1165# CONFIG_USB_TEST is not set 1752# CONFIG_USB_TEST is not set
1753# CONFIG_USB_GADGET is not set
1754# CONFIG_MMC is not set
1755# CONFIG_MEMSTICK is not set
1756CONFIG_NEW_LEDS=y
1757CONFIG_LEDS_CLASS=y
1166 1758
1167# 1759#
1168# USB DSL modem support 1760# LED drivers
1169# 1761#
1762# CONFIG_LEDS_CLEVO_MAIL is not set
1170 1763
1171# 1764#
1172# USB Gadget Support 1765# LED Triggers
1173# 1766#
1174# CONFIG_USB_GADGET is not set 1767CONFIG_LEDS_TRIGGERS=y
1175# CONFIG_MMC is not set 1768# CONFIG_LEDS_TRIGGER_TIMER is not set
1769# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1770# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1771# CONFIG_ACCESSIBILITY is not set
1772# CONFIG_INFINIBAND is not set
1773CONFIG_EDAC=y
1176 1774
1177# 1775#
1178# LED devices 1776# Reporting subsystems
1179# 1777#
1180# CONFIG_NEW_LEDS is not set 1778# CONFIG_EDAC_DEBUG is not set
1779# CONFIG_EDAC_MM_EDAC is not set
1780CONFIG_RTC_LIB=y
1781CONFIG_RTC_CLASS=y
1782# CONFIG_RTC_HCTOSYS is not set
1783# CONFIG_RTC_DEBUG is not set
1181 1784
1182# 1785#
1183# LED drivers 1786# RTC interfaces
1184# 1787#
1788CONFIG_RTC_INTF_SYSFS=y
1789CONFIG_RTC_INTF_PROC=y
1790CONFIG_RTC_INTF_DEV=y
1791# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
1792# CONFIG_RTC_DRV_TEST is not set
1185 1793
1186# 1794#
1187# LED Triggers 1795# I2C RTC drivers
1188# 1796#
1189# CONFIG_INFINIBAND is not set 1797# CONFIG_RTC_DRV_DS1307 is not set
1190# CONFIG_EDAC is not set 1798# CONFIG_RTC_DRV_DS1374 is not set
1799# CONFIG_RTC_DRV_DS1672 is not set
1800# CONFIG_RTC_DRV_MAX6900 is not set
1801# CONFIG_RTC_DRV_RS5C372 is not set
1802# CONFIG_RTC_DRV_ISL1208 is not set
1803# CONFIG_RTC_DRV_X1205 is not set
1804# CONFIG_RTC_DRV_PCF8563 is not set
1805# CONFIG_RTC_DRV_PCF8583 is not set
1806# CONFIG_RTC_DRV_M41T80 is not set
1807# CONFIG_RTC_DRV_S35390A is not set
1191 1808
1192# 1809#
1193# Real Time Clock 1810# SPI RTC drivers
1194# 1811#
1195# CONFIG_RTC_CLASS is not set
1196 1812
1197# 1813#
1198# DMA Engine support 1814# Platform RTC drivers
1199# 1815#
1200# CONFIG_DMA_ENGINE is not set 1816CONFIG_RTC_DRV_CMOS=y
1817# CONFIG_RTC_DRV_DS1511 is not set
1818# CONFIG_RTC_DRV_DS1553 is not set
1819# CONFIG_RTC_DRV_DS1742 is not set
1820# CONFIG_RTC_DRV_STK17TA8 is not set
1821# CONFIG_RTC_DRV_M48T86 is not set
1822# CONFIG_RTC_DRV_M48T59 is not set
1823# CONFIG_RTC_DRV_V3020 is not set
1201 1824
1202# 1825#
1203# DMA Clients 1826# on-CPU RTC drivers
1204# 1827#
1828CONFIG_DMADEVICES=y
1205 1829
1206# 1830#
1207# DMA Devices 1831# DMA Devices
1208# 1832#
1209CONFIG_VIRTUALIZATION=y 1833# CONFIG_INTEL_IOATDMA is not set
1210# CONFIG_KVM is not set 1834# CONFIG_UIO is not set
1211 1835
1212# 1836#
1213# Userspace I/O 1837# Firmware Drivers
1214# 1838#
1215# CONFIG_UIO is not set 1839# CONFIG_EDD is not set
1840CONFIG_EFI_VARS=y
1841# CONFIG_DELL_RBU is not set
1842# CONFIG_DCDBAS is not set
1843CONFIG_DMIID=y
1844# CONFIG_ISCSI_IBFT_FIND is not set
1216 1845
1217# 1846#
1218# File systems 1847# File systems
1219# 1848#
1220CONFIG_EXT2_FS=y 1849# CONFIG_EXT2_FS is not set
1221CONFIG_EXT2_FS_XATTR=y
1222CONFIG_EXT2_FS_POSIX_ACL=y
1223# CONFIG_EXT2_FS_SECURITY is not set
1224# CONFIG_EXT2_FS_XIP is not set
1225CONFIG_EXT3_FS=y 1850CONFIG_EXT3_FS=y
1226CONFIG_EXT3_FS_XATTR=y 1851CONFIG_EXT3_FS_XATTR=y
1227CONFIG_EXT3_FS_POSIX_ACL=y 1852CONFIG_EXT3_FS_POSIX_ACL=y
1228# CONFIG_EXT3_FS_SECURITY is not set 1853CONFIG_EXT3_FS_SECURITY=y
1229# CONFIG_EXT4DEV_FS is not set 1854# CONFIG_EXT4DEV_FS is not set
1230CONFIG_JBD=y 1855CONFIG_JBD=y
1231# CONFIG_JBD_DEBUG is not set 1856# CONFIG_JBD_DEBUG is not set
1232CONFIG_FS_MBCACHE=y 1857CONFIG_FS_MBCACHE=y
1233CONFIG_REISERFS_FS=y 1858# CONFIG_REISERFS_FS is not set
1234# CONFIG_REISERFS_CHECK is not set
1235# CONFIG_REISERFS_PROC_INFO is not set
1236CONFIG_REISERFS_FS_XATTR=y
1237CONFIG_REISERFS_FS_POSIX_ACL=y
1238# CONFIG_REISERFS_FS_SECURITY is not set
1239# CONFIG_JFS_FS is not set 1859# CONFIG_JFS_FS is not set
1240CONFIG_FS_POSIX_ACL=y 1860CONFIG_FS_POSIX_ACL=y
1241# CONFIG_XFS_FS is not set 1861# CONFIG_XFS_FS is not set
1242# CONFIG_GFS2_FS is not set
1243# CONFIG_OCFS2_FS is not set 1862# CONFIG_OCFS2_FS is not set
1244# CONFIG_MINIX_FS is not set 1863CONFIG_DNOTIFY=y
1245# CONFIG_ROMFS_FS is not set
1246CONFIG_INOTIFY=y 1864CONFIG_INOTIFY=y
1247CONFIG_INOTIFY_USER=y 1865CONFIG_INOTIFY_USER=y
1248# CONFIG_QUOTA is not set 1866CONFIG_QUOTA=y
1249CONFIG_DNOTIFY=y 1867CONFIG_QUOTA_NETLINK_INTERFACE=y
1868# CONFIG_PRINT_QUOTA_WARNING is not set
1869# CONFIG_QFMT_V1 is not set
1870CONFIG_QFMT_V2=y
1871CONFIG_QUOTACTL=y
1250# CONFIG_AUTOFS_FS is not set 1872# CONFIG_AUTOFS_FS is not set
1251CONFIG_AUTOFS4_FS=y 1873CONFIG_AUTOFS4_FS=y
1252# CONFIG_FUSE_FS is not set 1874# CONFIG_FUSE_FS is not set
@@ -1256,8 +1878,8 @@ CONFIG_GENERIC_ACL=y
1256# CD-ROM/DVD Filesystems 1878# CD-ROM/DVD Filesystems
1257# 1879#
1258CONFIG_ISO9660_FS=y 1880CONFIG_ISO9660_FS=y
1259# CONFIG_JOLIET is not set 1881CONFIG_JOLIET=y
1260# CONFIG_ZISOFS is not set 1882CONFIG_ZISOFS=y
1261# CONFIG_UDF_FS is not set 1883# CONFIG_UDF_FS is not set
1262 1884
1263# 1885#
@@ -1275,13 +1897,13 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
1275# 1897#
1276CONFIG_PROC_FS=y 1898CONFIG_PROC_FS=y
1277CONFIG_PROC_KCORE=y 1899CONFIG_PROC_KCORE=y
1900CONFIG_PROC_VMCORE=y
1278CONFIG_PROC_SYSCTL=y 1901CONFIG_PROC_SYSCTL=y
1279CONFIG_SYSFS=y 1902CONFIG_SYSFS=y
1280CONFIG_TMPFS=y 1903CONFIG_TMPFS=y
1281CONFIG_TMPFS_POSIX_ACL=y 1904CONFIG_TMPFS_POSIX_ACL=y
1282CONFIG_HUGETLBFS=y 1905CONFIG_HUGETLBFS=y
1283CONFIG_HUGETLB_PAGE=y 1906CONFIG_HUGETLB_PAGE=y
1284CONFIG_RAMFS=y
1285# CONFIG_CONFIGFS_FS is not set 1907# CONFIG_CONFIGFS_FS is not set
1286 1908
1287# 1909#
@@ -1289,6 +1911,7 @@ CONFIG_RAMFS=y
1289# 1911#
1290# CONFIG_ADFS_FS is not set 1912# CONFIG_ADFS_FS is not set
1291# CONFIG_AFFS_FS is not set 1913# CONFIG_AFFS_FS is not set
1914# CONFIG_ECRYPT_FS is not set
1292# CONFIG_HFS_FS is not set 1915# CONFIG_HFS_FS is not set
1293# CONFIG_HFSPLUS_FS is not set 1916# CONFIG_HFSPLUS_FS is not set
1294# CONFIG_BEFS_FS is not set 1917# CONFIG_BEFS_FS is not set
@@ -1296,33 +1919,15 @@ CONFIG_RAMFS=y
1296# CONFIG_EFS_FS is not set 1919# CONFIG_EFS_FS is not set
1297# CONFIG_CRAMFS is not set 1920# CONFIG_CRAMFS is not set
1298# CONFIG_VXFS_FS is not set 1921# CONFIG_VXFS_FS is not set
1922# CONFIG_MINIX_FS is not set
1299# CONFIG_HPFS_FS is not set 1923# CONFIG_HPFS_FS is not set
1300# CONFIG_QNX4FS_FS is not set 1924# CONFIG_QNX4FS_FS is not set
1925# CONFIG_ROMFS_FS is not set
1301# CONFIG_SYSV_FS is not set 1926# CONFIG_SYSV_FS is not set
1302# CONFIG_UFS_FS is not set 1927# CONFIG_UFS_FS is not set
1303 1928CONFIG_NETWORK_FILESYSTEMS=y
1304# 1929# CONFIG_NFS_FS is not set
1305# Network File Systems 1930# CONFIG_NFSD is not set
1306#
1307CONFIG_NFS_FS=y
1308CONFIG_NFS_V3=y
1309# CONFIG_NFS_V3_ACL is not set
1310# CONFIG_NFS_V4 is not set
1311# CONFIG_NFS_DIRECTIO is not set
1312CONFIG_NFSD=y
1313CONFIG_NFSD_V3=y
1314# CONFIG_NFSD_V3_ACL is not set
1315# CONFIG_NFSD_V4 is not set
1316CONFIG_NFSD_TCP=y
1317CONFIG_ROOT_NFS=y
1318CONFIG_LOCKD=y
1319CONFIG_LOCKD_V4=y
1320CONFIG_EXPORTFS=y
1321CONFIG_NFS_COMMON=y
1322CONFIG_SUNRPC=y
1323# CONFIG_SUNRPC_BIND34 is not set
1324# CONFIG_RPCSEC_GSS_KRB5 is not set
1325# CONFIG_RPCSEC_GSS_SPKM3 is not set
1326# CONFIG_SMB_FS is not set 1931# CONFIG_SMB_FS is not set
1327# CONFIG_CIFS is not set 1932# CONFIG_CIFS is not set
1328# CONFIG_NCP_FS is not set 1933# CONFIG_NCP_FS is not set
@@ -1332,14 +1937,26 @@ CONFIG_SUNRPC=y
1332# 1937#
1333# Partition Types 1938# Partition Types
1334# 1939#
1335# CONFIG_PARTITION_ADVANCED is not set 1940CONFIG_PARTITION_ADVANCED=y
1941# CONFIG_ACORN_PARTITION is not set
1942CONFIG_OSF_PARTITION=y
1943CONFIG_AMIGA_PARTITION=y
1944# CONFIG_ATARI_PARTITION is not set
1945CONFIG_MAC_PARTITION=y
1336CONFIG_MSDOS_PARTITION=y 1946CONFIG_MSDOS_PARTITION=y
1337 1947CONFIG_BSD_DISKLABEL=y
1338# 1948CONFIG_MINIX_SUBPARTITION=y
1339# Native Language Support 1949CONFIG_SOLARIS_X86_PARTITION=y
1340# 1950CONFIG_UNIXWARE_DISKLABEL=y
1951# CONFIG_LDM_PARTITION is not set
1952CONFIG_SGI_PARTITION=y
1953# CONFIG_ULTRIX_PARTITION is not set
1954CONFIG_SUN_PARTITION=y
1955CONFIG_KARMA_PARTITION=y
1956CONFIG_EFI_PARTITION=y
1957# CONFIG_SYSV68_PARTITION is not set
1341CONFIG_NLS=y 1958CONFIG_NLS=y
1342CONFIG_NLS_DEFAULT="iso8859-1" 1959CONFIG_NLS_DEFAULT="utf8"
1343CONFIG_NLS_CODEPAGE_437=y 1960CONFIG_NLS_CODEPAGE_437=y
1344# CONFIG_NLS_CODEPAGE_737 is not set 1961# CONFIG_NLS_CODEPAGE_737 is not set
1345# CONFIG_NLS_CODEPAGE_775 is not set 1962# CONFIG_NLS_CODEPAGE_775 is not set
@@ -1374,37 +1991,33 @@ CONFIG_NLS_ISO8859_1=y
1374# CONFIG_NLS_ISO8859_9 is not set 1991# CONFIG_NLS_ISO8859_9 is not set
1375# CONFIG_NLS_ISO8859_13 is not set 1992# CONFIG_NLS_ISO8859_13 is not set
1376# CONFIG_NLS_ISO8859_14 is not set 1993# CONFIG_NLS_ISO8859_14 is not set
1377CONFIG_NLS_ISO8859_15=y 1994# CONFIG_NLS_ISO8859_15 is not set
1378# CONFIG_NLS_KOI8_R is not set 1995# CONFIG_NLS_KOI8_R is not set
1379# CONFIG_NLS_KOI8_U is not set 1996# CONFIG_NLS_KOI8_U is not set
1380CONFIG_NLS_UTF8=y 1997CONFIG_NLS_UTF8=y
1381
1382#
1383# Distributed Lock Manager
1384#
1385# CONFIG_DLM is not set 1998# CONFIG_DLM is not set
1386CONFIG_INSTRUMENTATION=y
1387CONFIG_PROFILING=y
1388CONFIG_OPROFILE=y
1389CONFIG_KPROBES=y
1390 1999
1391# 2000#
1392# Kernel hacking 2001# Kernel hacking
1393# 2002#
1394CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2003CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1395# CONFIG_PRINTK_TIME is not set 2004# CONFIG_PRINTK_TIME is not set
2005# CONFIG_ENABLE_WARN_DEPRECATED is not set
1396# CONFIG_ENABLE_MUST_CHECK is not set 2006# CONFIG_ENABLE_MUST_CHECK is not set
2007CONFIG_FRAME_WARN=2048
1397CONFIG_MAGIC_SYSRQ=y 2008CONFIG_MAGIC_SYSRQ=y
1398CONFIG_UNUSED_SYMBOLS=y 2009# CONFIG_UNUSED_SYMBOLS is not set
1399# CONFIG_DEBUG_FS is not set 2010CONFIG_DEBUG_FS=y
1400# CONFIG_HEADERS_CHECK is not set 2011# CONFIG_HEADERS_CHECK is not set
1401CONFIG_DEBUG_KERNEL=y 2012CONFIG_DEBUG_KERNEL=y
1402# CONFIG_DEBUG_SHIRQ is not set 2013# CONFIG_DEBUG_SHIRQ is not set
1403CONFIG_DETECT_SOFTLOCKUP=y 2014# CONFIG_DETECT_SOFTLOCKUP is not set
1404# CONFIG_SCHED_DEBUG is not set 2015# CONFIG_SCHED_DEBUG is not set
1405# CONFIG_SCHEDSTATS is not set 2016CONFIG_SCHEDSTATS=y
1406CONFIG_TIMER_STATS=y 2017CONFIG_TIMER_STATS=y
2018# CONFIG_DEBUG_OBJECTS is not set
1407# CONFIG_SLUB_DEBUG_ON is not set 2019# CONFIG_SLUB_DEBUG_ON is not set
2020# CONFIG_SLUB_STATS is not set
1408# CONFIG_DEBUG_RT_MUTEXES is not set 2021# CONFIG_DEBUG_RT_MUTEXES is not set
1409# CONFIG_RT_MUTEX_TESTER is not set 2022# CONFIG_RT_MUTEX_TESTER is not set
1410# CONFIG_DEBUG_SPINLOCK is not set 2023# CONFIG_DEBUG_SPINLOCK is not set
@@ -1419,48 +2032,174 @@ CONFIG_TIMER_STATS=y
1419CONFIG_DEBUG_BUGVERBOSE=y 2032CONFIG_DEBUG_BUGVERBOSE=y
1420# CONFIG_DEBUG_INFO is not set 2033# CONFIG_DEBUG_INFO is not set
1421# CONFIG_DEBUG_VM is not set 2034# CONFIG_DEBUG_VM is not set
2035# CONFIG_DEBUG_WRITECOUNT is not set
1422# CONFIG_DEBUG_LIST is not set 2036# CONFIG_DEBUG_LIST is not set
1423# CONFIG_FRAME_POINTER is not set 2037# CONFIG_DEBUG_SG is not set
1424CONFIG_OPTIMIZE_INLINING=y 2038CONFIG_FRAME_POINTER=y
2039# CONFIG_BOOT_PRINTK_DELAY is not set
1425# CONFIG_RCU_TORTURE_TEST is not set 2040# CONFIG_RCU_TORTURE_TEST is not set
2041# CONFIG_KPROBES_SANITY_TEST is not set
2042# CONFIG_BACKTRACE_SELF_TEST is not set
1426# CONFIG_LKDTM is not set 2043# CONFIG_LKDTM is not set
1427# CONFIG_FAULT_INJECTION is not set 2044# CONFIG_FAULT_INJECTION is not set
2045# CONFIG_LATENCYTOP is not set
2046CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y
2050# CONFIG_STRICT_DEVMEM is not set
1428CONFIG_EARLY_PRINTK=y 2051CONFIG_EARLY_PRINTK=y
1429CONFIG_DEBUG_STACKOVERFLOW=y 2052CONFIG_DEBUG_STACKOVERFLOW=y
1430# CONFIG_DEBUG_STACK_USAGE is not set 2053CONFIG_DEBUG_STACK_USAGE=y
1431# CONFIG_DEBUG_RODATA is not set 2054# CONFIG_DEBUG_PAGEALLOC is not set
2055# CONFIG_X86_PTDUMP is not set
2056CONFIG_DEBUG_RODATA=y
2057# CONFIG_DEBUG_RODATA_TEST is not set
2058CONFIG_DEBUG_NX_TEST=m
1432# CONFIG_4KSTACKS is not set 2059# CONFIG_4KSTACKS is not set
1433CONFIG_X86_FIND_SMP_CONFIG=y 2060CONFIG_X86_FIND_SMP_CONFIG=y
1434CONFIG_X86_MPPARSE=y 2061CONFIG_X86_MPPARSE=y
1435CONFIG_DOUBLEFAULT=y 2062CONFIG_DOUBLEFAULT=y
2063CONFIG_IO_DELAY_TYPE_0X80=0
2064CONFIG_IO_DELAY_TYPE_0XED=1
2065CONFIG_IO_DELAY_TYPE_UDELAY=2
2066CONFIG_IO_DELAY_TYPE_NONE=3
2067CONFIG_IO_DELAY_0X80=y
2068# CONFIG_IO_DELAY_0XED is not set
2069# CONFIG_IO_DELAY_UDELAY is not set
2070# CONFIG_IO_DELAY_NONE is not set
2071CONFIG_DEFAULT_IO_DELAY_TYPE=0
2072CONFIG_DEBUG_BOOT_PARAMS=y
2073# CONFIG_CPA_DEBUG is not set
1436 2074
1437# 2075#
1438# Security options 2076# Security options
1439# 2077#
1440# CONFIG_KEYS is not set 2078CONFIG_KEYS=y
1441# CONFIG_SECURITY is not set 2079CONFIG_KEYS_DEBUG_PROC_KEYS=y
1442# CONFIG_CRYPTO is not set 2080CONFIG_SECURITY=y
2081CONFIG_SECURITY_NETWORK=y
2082# CONFIG_SECURITY_NETWORK_XFRM is not set
2083CONFIG_SECURITY_CAPABILITIES=y
2084CONFIG_SECURITY_FILE_CAPABILITIES=y
2085# CONFIG_SECURITY_ROOTPLUG is not set
2086CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
2087CONFIG_SECURITY_SELINUX=y
2088CONFIG_SECURITY_SELINUX_BOOTPARAM=y
2089CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
2090CONFIG_SECURITY_SELINUX_DISABLE=y
2091CONFIG_SECURITY_SELINUX_DEVELOP=y
2092CONFIG_SECURITY_SELINUX_AVC_STATS=y
2093CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2094# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2095# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2096# CONFIG_SECURITY_SMACK is not set
2097CONFIG_CRYPTO=y
2098
2099#
2100# Crypto core or helper
2101#
2102CONFIG_CRYPTO_ALGAPI=y
2103CONFIG_CRYPTO_AEAD=y
2104CONFIG_CRYPTO_BLKCIPHER=y
2105CONFIG_CRYPTO_HASH=y
2106CONFIG_CRYPTO_MANAGER=y
2107# CONFIG_CRYPTO_GF128MUL is not set
2108# CONFIG_CRYPTO_NULL is not set
2109# CONFIG_CRYPTO_CRYPTD is not set
2110CONFIG_CRYPTO_AUTHENC=y
2111# CONFIG_CRYPTO_TEST is not set
2112
2113#
2114# Authenticated Encryption with Associated Data
2115#
2116# CONFIG_CRYPTO_CCM is not set
2117# CONFIG_CRYPTO_GCM is not set
2118# CONFIG_CRYPTO_SEQIV is not set
2119
2120#
2121# Block modes
2122#
2123CONFIG_CRYPTO_CBC=y
2124# CONFIG_CRYPTO_CTR is not set
2125# CONFIG_CRYPTO_CTS is not set
2126CONFIG_CRYPTO_ECB=y
2127# CONFIG_CRYPTO_LRW is not set
2128# CONFIG_CRYPTO_PCBC is not set
2129# CONFIG_CRYPTO_XTS is not set
2130
2131#
2132# Hash modes
2133#
2134CONFIG_CRYPTO_HMAC=y
2135# CONFIG_CRYPTO_XCBC is not set
2136
2137#
2138# Digest
2139#
2140# CONFIG_CRYPTO_CRC32C is not set
2141# CONFIG_CRYPTO_MD4 is not set
2142CONFIG_CRYPTO_MD5=y
2143# CONFIG_CRYPTO_MICHAEL_MIC is not set
2144CONFIG_CRYPTO_SHA1=y
2145# CONFIG_CRYPTO_SHA256 is not set
2146# CONFIG_CRYPTO_SHA512 is not set
2147# CONFIG_CRYPTO_TGR192 is not set
2148# CONFIG_CRYPTO_WP512 is not set
2149
2150#
2151# Ciphers
2152#
2153CONFIG_CRYPTO_AES=y
2154# CONFIG_CRYPTO_AES_586 is not set
2155# CONFIG_CRYPTO_ANUBIS is not set
2156CONFIG_CRYPTO_ARC4=y
2157# CONFIG_CRYPTO_BLOWFISH is not set
2158# CONFIG_CRYPTO_CAMELLIA is not set
2159# CONFIG_CRYPTO_CAST5 is not set
2160# CONFIG_CRYPTO_CAST6 is not set
2161CONFIG_CRYPTO_DES=y
2162# CONFIG_CRYPTO_FCRYPT is not set
2163# CONFIG_CRYPTO_KHAZAD is not set
2164# CONFIG_CRYPTO_SALSA20 is not set
2165# CONFIG_CRYPTO_SALSA20_586 is not set
2166# CONFIG_CRYPTO_SEED is not set
2167# CONFIG_CRYPTO_SERPENT is not set
2168# CONFIG_CRYPTO_TEA is not set
2169# CONFIG_CRYPTO_TWOFISH is not set
2170# CONFIG_CRYPTO_TWOFISH_586 is not set
2171
2172#
2173# Compression
2174#
2175# CONFIG_CRYPTO_DEFLATE is not set
2176# CONFIG_CRYPTO_LZO is not set
2177CONFIG_CRYPTO_HW=y
2178# CONFIG_CRYPTO_DEV_PADLOCK is not set
2179# CONFIG_CRYPTO_DEV_GEODE is not set
2180# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2181CONFIG_HAVE_KVM=y
2182CONFIG_VIRTUALIZATION=y
2183# CONFIG_KVM is not set
2184# CONFIG_LGUEST is not set
2185# CONFIG_VIRTIO_PCI is not set
2186# CONFIG_VIRTIO_BALLOON is not set
1443 2187
1444# 2188#
1445# Library routines 2189# Library routines
1446# 2190#
1447CONFIG_BITREVERSE=y 2191CONFIG_BITREVERSE=y
2192CONFIG_GENERIC_FIND_FIRST_BIT=y
2193CONFIG_GENERIC_FIND_NEXT_BIT=y
1448# CONFIG_CRC_CCITT is not set 2194# CONFIG_CRC_CCITT is not set
1449# CONFIG_CRC16 is not set 2195# CONFIG_CRC16 is not set
1450# CONFIG_CRC_ITU_T is not set 2196# CONFIG_CRC_ITU_T is not set
1451CONFIG_CRC32=y 2197CONFIG_CRC32=y
1452# CONFIG_CRC7 is not set 2198# CONFIG_CRC7 is not set
1453# CONFIG_LIBCRC32C is not set 2199# CONFIG_LIBCRC32C is not set
2200CONFIG_AUDIT_GENERIC=y
1454CONFIG_ZLIB_INFLATE=y 2201CONFIG_ZLIB_INFLATE=y
1455CONFIG_PLIST=y 2202CONFIG_PLIST=y
1456CONFIG_HAS_IOMEM=y 2203CONFIG_HAS_IOMEM=y
1457CONFIG_HAS_IOPORT=y 2204CONFIG_HAS_IOPORT=y
1458CONFIG_HAS_DMA=y 2205CONFIG_HAS_DMA=y
1459CONFIG_GENERIC_HARDIRQS=y
1460CONFIG_GENERIC_IRQ_PROBE=y
1461CONFIG_GENERIC_PENDING_IRQ=y
1462CONFIG_X86_SMP=y
1463CONFIG_X86_HT=y
1464CONFIG_X86_BIOS_REBOOT=y
1465CONFIG_X86_TRAMPOLINE=y
1466CONFIG_KTIME_SCALAR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 2d6f5b2809d2..a40452429625 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,64 +1,103 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.22-git14 3# Linux kernel version: 2.6.26-rc1
4# Fri Jul 20 09:53:15 2007 4# Sun May 4 19:59:57 2008
5# 5#
6CONFIG_X86_64=y
7CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y
8CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
9CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
10CONFIG_GENERIC_TIME_VSYSCALL=y
11CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
12CONFIG_ZONE_DMA32=y 14CONFIG_CLOCKSOURCE_WATCHDOG=y
15CONFIG_GENERIC_CLOCKEVENTS=y
16CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
13CONFIG_LOCKDEP_SUPPORT=y 17CONFIG_LOCKDEP_SUPPORT=y
14CONFIG_STACKTRACE_SUPPORT=y 18CONFIG_STACKTRACE_SUPPORT=y
15CONFIG_SEMAPHORE_SLEEPERS=y 19CONFIG_HAVE_LATENCYTOP_SUPPORT=y
20CONFIG_FAST_CMPXCHG_LOCAL=y
16CONFIG_MMU=y 21CONFIG_MMU=y
17CONFIG_ZONE_DMA=y 22CONFIG_ZONE_DMA=y
18CONFIG_QUICKLIST=y
19CONFIG_NR_QUICK=2
20CONFIG_RWSEM_GENERIC_SPINLOCK=y
21CONFIG_GENERIC_HWEIGHT=y
22CONFIG_GENERIC_CALIBRATE_DELAY=y
23CONFIG_X86_CMPXCHG=y
24CONFIG_EARLY_PRINTK=y
25CONFIG_GENERIC_ISA_DMA=y 23CONFIG_GENERIC_ISA_DMA=y
26CONFIG_GENERIC_IOMAP=y 24CONFIG_GENERIC_IOMAP=y
27CONFIG_ARCH_MAY_HAVE_PC_FDC=y
28CONFIG_ARCH_POPULATES_NODE_MAP=y
29CONFIG_DMI=y
30CONFIG_AUDIT_ARCH=y
31CONFIG_GENERIC_BUG=y 25CONFIG_GENERIC_BUG=y
26CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
28CONFIG_ARCH_MAY_HAVE_PC_FDC=y
29CONFIG_RWSEM_GENERIC_SPINLOCK=y
30# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
32# CONFIG_ARCH_HAS_ILOG2_U32 is not set 31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
33# CONFIG_ARCH_HAS_ILOG2_U64 is not set 32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
34CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y
35CONFIG_GENERIC_TIME_VSYSCALL=y
36CONFIG_ARCH_HAS_CPU_RELAX=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
41CONFIG_ARCH_SUSPEND_POSSIBLE=y
42CONFIG_ZONE_DMA32=y
43CONFIG_ARCH_POPULATES_NODE_MAP=y
44CONFIG_AUDIT_ARCH=y
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y
51CONFIG_X86_64_SMP=y
52CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y
55# CONFIG_KTIME_SCALAR is not set
35 56
36# 57#
37# Code maturity level options 58# General setup
38# 59#
39CONFIG_EXPERIMENTAL=y 60CONFIG_EXPERIMENTAL=y
40CONFIG_LOCK_KERNEL=y 61CONFIG_LOCK_KERNEL=y
41CONFIG_INIT_ENV_ARG_LIMIT=32 62CONFIG_INIT_ENV_ARG_LIMIT=32
42
43#
44# General setup
45#
46CONFIG_LOCALVERSION="" 63CONFIG_LOCALVERSION=""
47CONFIG_LOCALVERSION_AUTO=y 64# CONFIG_LOCALVERSION_AUTO is not set
48CONFIG_SWAP=y 65CONFIG_SWAP=y
49CONFIG_SYSVIPC=y 66CONFIG_SYSVIPC=y
50CONFIG_SYSVIPC_SYSCTL=y 67CONFIG_SYSVIPC_SYSCTL=y
51CONFIG_POSIX_MQUEUE=y 68CONFIG_POSIX_MQUEUE=y
52# CONFIG_BSD_PROCESS_ACCT is not set 69CONFIG_BSD_PROCESS_ACCT=y
53# CONFIG_TASKSTATS is not set 70# CONFIG_BSD_PROCESS_ACCT_V3 is not set
54# CONFIG_USER_NS is not set 71CONFIG_TASKSTATS=y
55# CONFIG_AUDIT is not set 72CONFIG_TASK_DELAY_ACCT=y
56CONFIG_IKCONFIG=y 73CONFIG_TASK_XACCT=y
57CONFIG_IKCONFIG_PROC=y 74CONFIG_TASK_IO_ACCOUNTING=y
58CONFIG_LOG_BUF_SHIFT=18 75CONFIG_AUDIT=y
59# CONFIG_CPUSETS is not set 76CONFIG_AUDITSYSCALL=y
60CONFIG_SYSFS_DEPRECATED=y 77CONFIG_AUDIT_TREE=y
78# CONFIG_IKCONFIG is not set
79CONFIG_LOG_BUF_SHIFT=17
80CONFIG_CGROUPS=y
81# CONFIG_CGROUP_DEBUG is not set
82CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y
85CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set
88# CONFIG_USER_SCHED is not set
89CONFIG_CGROUP_SCHED=y
90CONFIG_CGROUP_CPUACCT=y
91CONFIG_RESOURCE_COUNTERS=y
92# CONFIG_CGROUP_MEM_RES_CTLR is not set
93# CONFIG_SYSFS_DEPRECATED_V2 is not set
94CONFIG_PROC_PID_CPUSET=y
61CONFIG_RELAY=y 95CONFIG_RELAY=y
96CONFIG_NAMESPACES=y
97CONFIG_UTS_NS=y
98CONFIG_IPC_NS=y
99CONFIG_USER_NS=y
100CONFIG_PID_NS=y
62CONFIG_BLK_DEV_INITRD=y 101CONFIG_BLK_DEV_INITRD=y
63CONFIG_INITRAMFS_SOURCE="" 102CONFIG_INITRAMFS_SOURCE=""
64CONFIG_CC_OPTIMIZE_FOR_SIZE=y 103CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -66,13 +105,15 @@ CONFIG_SYSCTL=y
66# CONFIG_EMBEDDED is not set 105# CONFIG_EMBEDDED is not set
67CONFIG_UID16=y 106CONFIG_UID16=y
68CONFIG_SYSCTL_SYSCALL=y 107CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
69CONFIG_KALLSYMS=y 109CONFIG_KALLSYMS=y
70CONFIG_KALLSYMS_ALL=y 110CONFIG_KALLSYMS_ALL=y
71# CONFIG_KALLSYMS_EXTRA_PASS is not set 111CONFIG_KALLSYMS_EXTRA_PASS=y
72CONFIG_HOTPLUG=y 112CONFIG_HOTPLUG=y
73CONFIG_PRINTK=y 113CONFIG_PRINTK=y
74CONFIG_BUG=y 114CONFIG_BUG=y
75CONFIG_ELF_CORE=y 115CONFIG_ELF_CORE=y
116# CONFIG_COMPAT_BRK is not set
76CONFIG_BASE_FULL=y 117CONFIG_BASE_FULL=y
77CONFIG_FUTEX=y 118CONFIG_FUTEX=y
78CONFIG_ANON_INODES=y 119CONFIG_ANON_INODES=y
@@ -82,9 +123,21 @@ CONFIG_TIMERFD=y
82CONFIG_EVENTFD=y 123CONFIG_EVENTFD=y
83CONFIG_SHMEM=y 124CONFIG_SHMEM=y
84CONFIG_VM_EVENT_COUNTERS=y 125CONFIG_VM_EVENT_COUNTERS=y
85CONFIG_SLAB=y 126CONFIG_SLUB_DEBUG=y
86# CONFIG_SLUB is not set 127# CONFIG_SLAB is not set
128CONFIG_SLUB=y
87# CONFIG_SLOB is not set 129# CONFIG_SLOB is not set
130CONFIG_PROFILING=y
131CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y
135CONFIG_KRETPROBES=y
136CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y
138# CONFIG_HAVE_DMA_ATTRS is not set
139CONFIG_PROC_PAGE_MONITOR=y
140CONFIG_SLABINFO=y
88CONFIG_RT_MUTEXES=y 141CONFIG_RT_MUTEXES=y
89# CONFIG_TINY_SHMEM is not set 142# CONFIG_TINY_SHMEM is not set
90CONFIG_BASE_SMALL=0 143CONFIG_BASE_SMALL=0
@@ -96,14 +149,15 @@ CONFIG_MODULE_FORCE_UNLOAD=y
96# CONFIG_KMOD is not set 149# CONFIG_KMOD is not set
97CONFIG_STOP_MACHINE=y 150CONFIG_STOP_MACHINE=y
98CONFIG_BLOCK=y 151CONFIG_BLOCK=y
99# CONFIG_BLK_DEV_IO_TRACE is not set 152CONFIG_BLK_DEV_IO_TRACE=y
100# CONFIG_BLK_DEV_BSG is not set 153CONFIG_BLK_DEV_BSG=y
154CONFIG_BLOCK_COMPAT=y
101 155
102# 156#
103# IO Schedulers 157# IO Schedulers
104# 158#
105CONFIG_IOSCHED_NOOP=y 159CONFIG_IOSCHED_NOOP=y
106# CONFIG_IOSCHED_AS is not set 160CONFIG_IOSCHED_AS=y
107CONFIG_IOSCHED_DEADLINE=y 161CONFIG_IOSCHED_DEADLINE=y
108CONFIG_IOSCHED_CFQ=y 162CONFIG_IOSCHED_CFQ=y
109# CONFIG_DEFAULT_AS is not set 163# CONFIG_DEFAULT_AS is not set
@@ -111,107 +165,177 @@ CONFIG_IOSCHED_CFQ=y
111CONFIG_DEFAULT_CFQ=y 165CONFIG_DEFAULT_CFQ=y
112# CONFIG_DEFAULT_NOOP is not set 166# CONFIG_DEFAULT_NOOP is not set
113CONFIG_DEFAULT_IOSCHED="cfq" 167CONFIG_DEFAULT_IOSCHED="cfq"
168CONFIG_CLASSIC_RCU=y
114 169
115# 170#
116# Processor type and features 171# Processor type and features
117# 172#
173CONFIG_TICK_ONESHOT=y
174CONFIG_NO_HZ=y
175CONFIG_HIGH_RES_TIMERS=y
176CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
177CONFIG_SMP=y
118CONFIG_X86_PC=y 178CONFIG_X86_PC=y
179# CONFIG_X86_ELAN is not set
180# CONFIG_X86_VOYAGER is not set
181# CONFIG_X86_NUMAQ is not set
182# CONFIG_X86_SUMMIT is not set
183# CONFIG_X86_BIGSMP is not set
184# CONFIG_X86_VISWS is not set
185# CONFIG_X86_GENERICARCH is not set
186# CONFIG_X86_ES7000 is not set
187# CONFIG_X86_RDC321X is not set
119# CONFIG_X86_VSMP is not set 188# CONFIG_X86_VSMP is not set
189# CONFIG_PARAVIRT_GUEST is not set
190CONFIG_MEMTEST_BOOTPARAM=y
191CONFIG_MEMTEST_BOOTPARAM_VALUE=0
192# CONFIG_M386 is not set
193# CONFIG_M486 is not set
194# CONFIG_M586 is not set
195# CONFIG_M586TSC is not set
196# CONFIG_M586MMX is not set
197# CONFIG_M686 is not set
198# CONFIG_MPENTIUMII is not set
199# CONFIG_MPENTIUMIII is not set
200# CONFIG_MPENTIUMM is not set
201# CONFIG_MPENTIUM4 is not set
202# CONFIG_MK6 is not set
203# CONFIG_MK7 is not set
120# CONFIG_MK8 is not set 204# CONFIG_MK8 is not set
205# CONFIG_MCRUSOE is not set
206# CONFIG_MEFFICEON is not set
207# CONFIG_MWINCHIPC6 is not set
208# CONFIG_MWINCHIP2 is not set
209# CONFIG_MWINCHIP3D is not set
210# CONFIG_MGEODEGX1 is not set
211# CONFIG_MGEODE_LX is not set
212# CONFIG_MCYRIXIII is not set
213# CONFIG_MVIAC3_2 is not set
214# CONFIG_MVIAC7 is not set
121# CONFIG_MPSC is not set 215# CONFIG_MPSC is not set
122# CONFIG_MCORE2 is not set 216CONFIG_MCORE2=y
123CONFIG_GENERIC_CPU=y 217# CONFIG_GENERIC_CPU is not set
124CONFIG_X86_L1_CACHE_BYTES=128 218CONFIG_X86_CPU=y
125CONFIG_X86_L1_CACHE_SHIFT=7 219CONFIG_X86_L1_CACHE_BYTES=64
126CONFIG_X86_INTERNODE_CACHE_BYTES=128 220CONFIG_X86_INTERNODE_CACHE_BYTES=64
127CONFIG_X86_TSC=y 221CONFIG_X86_CMPXCHG=y
222CONFIG_X86_L1_CACHE_SHIFT=6
128CONFIG_X86_GOOD_APIC=y 223CONFIG_X86_GOOD_APIC=y
129# CONFIG_MICROCODE is not set 224CONFIG_X86_INTEL_USERCOPY=y
130CONFIG_X86_MSR=y 225CONFIG_X86_USE_PPRO_CHECKSUM=y
131CONFIG_X86_CPUID=y 226CONFIG_X86_P6_NOP=y
132CONFIG_X86_HT=y 227CONFIG_X86_TSC=y
133CONFIG_X86_IO_APIC=y 228CONFIG_X86_CMOV=y
134CONFIG_X86_LOCAL_APIC=y 229CONFIG_X86_MINIMUM_CPU_FAMILY=64
135CONFIG_MTRR=y 230CONFIG_X86_DEBUGCTLMSR=y
136CONFIG_SMP=y 231CONFIG_HPET_TIMER=y
137CONFIG_SCHED_SMT=y 232CONFIG_HPET_EMULATE_RTC=y
233CONFIG_DMI=y
234CONFIG_GART_IOMMU=y
235CONFIG_CALGARY_IOMMU=y
236CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
237CONFIG_SWIOTLB=y
238CONFIG_IOMMU_HELPER=y
239CONFIG_NR_CPUS=4
240# CONFIG_SCHED_SMT is not set
138CONFIG_SCHED_MC=y 241CONFIG_SCHED_MC=y
139# CONFIG_PREEMPT_NONE is not set 242# CONFIG_PREEMPT_NONE is not set
140CONFIG_PREEMPT_VOLUNTARY=y 243CONFIG_PREEMPT_VOLUNTARY=y
141# CONFIG_PREEMPT is not set 244# CONFIG_PREEMPT is not set
142CONFIG_PREEMPT_BKL=y 245CONFIG_X86_LOCAL_APIC=y
246CONFIG_X86_IO_APIC=y
247# CONFIG_X86_MCE is not set
248# CONFIG_I8K is not set
249# CONFIG_MICROCODE is not set
250CONFIG_X86_MSR=y
251CONFIG_X86_CPUID=y
143CONFIG_NUMA=y 252CONFIG_NUMA=y
144CONFIG_K8_NUMA=y 253CONFIG_K8_NUMA=y
145CONFIG_NODES_SHIFT=6
146CONFIG_X86_64_ACPI_NUMA=y 254CONFIG_X86_64_ACPI_NUMA=y
147CONFIG_NUMA_EMU=y 255CONFIG_NODES_SPAN_OTHER_NODES=y
256# CONFIG_NUMA_EMU is not set
257CONFIG_NODES_SHIFT=6
258CONFIG_ARCH_SPARSEMEM_DEFAULT=y
259CONFIG_ARCH_SPARSEMEM_ENABLE=y
260CONFIG_ARCH_SELECT_MEMORY_MODEL=y
261CONFIG_SELECT_MEMORY_MODEL=y
262# CONFIG_FLATMEM_MANUAL is not set
263# CONFIG_DISCONTIGMEM_MANUAL is not set
264CONFIG_SPARSEMEM_MANUAL=y
265CONFIG_SPARSEMEM=y
148CONFIG_NEED_MULTIPLE_NODES=y 266CONFIG_NEED_MULTIPLE_NODES=y
267CONFIG_HAVE_MEMORY_PRESENT=y
149# CONFIG_SPARSEMEM_STATIC is not set 268# CONFIG_SPARSEMEM_STATIC is not set
269CONFIG_SPARSEMEM_EXTREME=y
270CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
271CONFIG_SPARSEMEM_VMEMMAP=y
272
273#
274# Memory hotplug is currently incompatible with Software Suspend
275#
276CONFIG_PAGEFLAGS_EXTENDED=y
150CONFIG_SPLIT_PTLOCK_CPUS=4 277CONFIG_SPLIT_PTLOCK_CPUS=4
151CONFIG_MIGRATION=y 278CONFIG_MIGRATION=y
152CONFIG_RESOURCES_64BIT=y 279CONFIG_RESOURCES_64BIT=y
153CONFIG_ZONE_DMA_FLAG=1 280CONFIG_ZONE_DMA_FLAG=1
154CONFIG_BOUNCE=y 281CONFIG_BOUNCE=y
155CONFIG_VIRT_TO_BUS=y 282CONFIG_VIRT_TO_BUS=y
156CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 283CONFIG_MTRR=y
157CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y 284# CONFIG_X86_PAT is not set
158CONFIG_NR_CPUS=32 285CONFIG_EFI=y
159CONFIG_PHYSICAL_ALIGN=0x200000
160CONFIG_HOTPLUG_CPU=y
161CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
162CONFIG_HPET_TIMER=y
163CONFIG_HPET_EMULATE_RTC=y
164CONFIG_GART_IOMMU=y
165# CONFIG_CALGARY_IOMMU is not set
166CONFIG_SWIOTLB=y
167CONFIG_X86_MCE=y
168CONFIG_X86_MCE_INTEL=y
169CONFIG_X86_MCE_AMD=y
170# CONFIG_KEXEC is not set
171# CONFIG_CRASH_DUMP is not set
172# CONFIG_RELOCATABLE is not set
173CONFIG_PHYSICAL_START=0x200000
174CONFIG_SECCOMP=y 286CONFIG_SECCOMP=y
175# CONFIG_CC_STACKPROTECTOR is not set
176# CONFIG_HZ_100 is not set 287# CONFIG_HZ_100 is not set
177CONFIG_HZ_250=y 288# CONFIG_HZ_250 is not set
178# CONFIG_HZ_300 is not set 289# CONFIG_HZ_300 is not set
179# CONFIG_HZ_1000 is not set 290CONFIG_HZ_1000=y
180CONFIG_HZ=250 291CONFIG_HZ=1000
181CONFIG_K8_NB=y 292CONFIG_SCHED_HRTICK=y
182CONFIG_GENERIC_HARDIRQS=y 293CONFIG_KEXEC=y
183CONFIG_GENERIC_IRQ_PROBE=y 294CONFIG_CRASH_DUMP=y
184CONFIG_ISA_DMA_API=y 295CONFIG_PHYSICAL_START=0x1000000
185CONFIG_GENERIC_PENDING_IRQ=y 296CONFIG_RELOCATABLE=y
297CONFIG_PHYSICAL_ALIGN=0x200000
298CONFIG_HOTPLUG_CPU=y
299# CONFIG_COMPAT_VDSO is not set
300CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
301CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
186 302
187# 303#
188# Power management options 304# Power management options
189# 305#
306CONFIG_ARCH_HIBERNATION_HEADER=y
190CONFIG_PM=y 307CONFIG_PM=y
191# CONFIG_PM_LEGACY is not set 308CONFIG_PM_DEBUG=y
192# CONFIG_PM_DEBUG is not set 309# CONFIG_PM_VERBOSE is not set
310CONFIG_CAN_PM_TRACE=y
311CONFIG_PM_TRACE=y
312CONFIG_PM_TRACE_RTC=y
313CONFIG_PM_SLEEP_SMP=y
314CONFIG_PM_SLEEP=y
315CONFIG_SUSPEND=y
316CONFIG_SUSPEND_FREEZER=y
193CONFIG_HIBERNATION=y 317CONFIG_HIBERNATION=y
194CONFIG_PM_STD_PARTITION="" 318CONFIG_PM_STD_PARTITION=""
195
196#
197# ACPI (Advanced Configuration and Power Interface) Support
198#
199CONFIG_ACPI=y 319CONFIG_ACPI=y
200CONFIG_ACPI_SLEEP=y 320CONFIG_ACPI_SLEEP=y
201CONFIG_ACPI_SLEEP_PROC_FS=y
202CONFIG_ACPI_SLEEP_PROC_SLEEP=y
203CONFIG_ACPI_PROCFS=y 321CONFIG_ACPI_PROCFS=y
322CONFIG_ACPI_PROCFS_POWER=y
323CONFIG_ACPI_SYSFS_POWER=y
324CONFIG_ACPI_PROC_EVENT=y
204CONFIG_ACPI_AC=y 325CONFIG_ACPI_AC=y
205CONFIG_ACPI_BATTERY=y 326CONFIG_ACPI_BATTERY=y
206CONFIG_ACPI_BUTTON=y 327CONFIG_ACPI_BUTTON=y
207CONFIG_ACPI_FAN=y 328CONFIG_ACPI_FAN=y
208# CONFIG_ACPI_DOCK is not set 329CONFIG_ACPI_DOCK=y
330# CONFIG_ACPI_BAY is not set
209CONFIG_ACPI_PROCESSOR=y 331CONFIG_ACPI_PROCESSOR=y
210CONFIG_ACPI_HOTPLUG_CPU=y 332CONFIG_ACPI_HOTPLUG_CPU=y
211CONFIG_ACPI_THERMAL=y 333CONFIG_ACPI_THERMAL=y
212CONFIG_ACPI_NUMA=y 334CONFIG_ACPI_NUMA=y
335# CONFIG_ACPI_WMI is not set
213# CONFIG_ACPI_ASUS is not set 336# CONFIG_ACPI_ASUS is not set
214# CONFIG_ACPI_TOSHIBA is not set 337# CONFIG_ACPI_TOSHIBA is not set
338# CONFIG_ACPI_CUSTOM_DSDT is not set
215CONFIG_ACPI_BLACKLIST_YEAR=0 339CONFIG_ACPI_BLACKLIST_YEAR=0
216# CONFIG_ACPI_DEBUG is not set 340# CONFIG_ACPI_DEBUG is not set
217CONFIG_ACPI_EC=y 341CONFIG_ACPI_EC=y
@@ -227,29 +351,34 @@ CONFIG_ACPI_CONTAINER=y
227CONFIG_CPU_FREQ=y 351CONFIG_CPU_FREQ=y
228CONFIG_CPU_FREQ_TABLE=y 352CONFIG_CPU_FREQ_TABLE=y
229CONFIG_CPU_FREQ_DEBUG=y 353CONFIG_CPU_FREQ_DEBUG=y
230CONFIG_CPU_FREQ_STAT=y 354# CONFIG_CPU_FREQ_STAT is not set
231# CONFIG_CPU_FREQ_STAT_DETAILS is not set 355# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
232CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y 356# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
233# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set 357CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
358# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
359# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
234CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 360CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
235# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set 361# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
236CONFIG_CPU_FREQ_GOV_USERSPACE=y 362CONFIG_CPU_FREQ_GOV_USERSPACE=y
237CONFIG_CPU_FREQ_GOV_ONDEMAND=y 363CONFIG_CPU_FREQ_GOV_ONDEMAND=y
238CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y 364# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
239 365
240# 366#
241# CPUFreq processor drivers 367# CPUFreq processor drivers
242# 368#
243CONFIG_X86_POWERNOW_K8=y
244CONFIG_X86_POWERNOW_K8_ACPI=y
245# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
246CONFIG_X86_ACPI_CPUFREQ=y 369CONFIG_X86_ACPI_CPUFREQ=y
370# CONFIG_X86_POWERNOW_K8 is not set
371# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
372# CONFIG_X86_P4_CLOCKMOD is not set
247 373
248# 374#
249# shared options 375# shared options
250# 376#
251CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y 377# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
252# CONFIG_X86_SPEEDSTEP_LIB is not set 378# CONFIG_X86_SPEEDSTEP_LIB is not set
379CONFIG_CPU_IDLE=y
380CONFIG_CPU_IDLE_GOV_LADDER=y
381CONFIG_CPU_IDLE_GOV_MENU=y
253 382
254# 383#
255# Bus options (PCI etc.) 384# Bus options (PCI etc.)
@@ -257,27 +386,56 @@ CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
257CONFIG_PCI=y 386CONFIG_PCI=y
258CONFIG_PCI_DIRECT=y 387CONFIG_PCI_DIRECT=y
259CONFIG_PCI_MMCONFIG=y 388CONFIG_PCI_MMCONFIG=y
389CONFIG_PCI_DOMAINS=y
390CONFIG_DMAR=y
391CONFIG_DMAR_GFX_WA=y
392CONFIG_DMAR_FLOPPY_WA=y
260CONFIG_PCIEPORTBUS=y 393CONFIG_PCIEPORTBUS=y
394# CONFIG_HOTPLUG_PCI_PCIE is not set
261CONFIG_PCIEAER=y 395CONFIG_PCIEAER=y
396# CONFIG_PCIEASPM is not set
262CONFIG_ARCH_SUPPORTS_MSI=y 397CONFIG_ARCH_SUPPORTS_MSI=y
263CONFIG_PCI_MSI=y 398CONFIG_PCI_MSI=y
399# CONFIG_PCI_LEGACY is not set
264# CONFIG_PCI_DEBUG is not set 400# CONFIG_PCI_DEBUG is not set
265# CONFIG_HT_IRQ is not set 401CONFIG_HT_IRQ=y
266 402CONFIG_ISA_DMA_API=y
267# 403CONFIG_K8_NB=y
268# PCCARD (PCMCIA/CardBus) support 404CONFIG_PCCARD=y
269# 405# CONFIG_PCMCIA_DEBUG is not set
270# CONFIG_PCCARD is not set 406CONFIG_PCMCIA=y
271# CONFIG_HOTPLUG_PCI is not set 407CONFIG_PCMCIA_LOAD_CIS=y
408CONFIG_PCMCIA_IOCTL=y
409CONFIG_CARDBUS=y
410
411#
412# PC-card bridges
413#
414CONFIG_YENTA=y
415CONFIG_YENTA_O2=y
416CONFIG_YENTA_RICOH=y
417CONFIG_YENTA_TI=y
418CONFIG_YENTA_ENE_TUNE=y
419CONFIG_YENTA_TOSHIBA=y
420# CONFIG_PD6729 is not set
421# CONFIG_I82092 is not set
422CONFIG_PCCARD_NONSTATIC=y
423CONFIG_HOTPLUG_PCI=y
424# CONFIG_HOTPLUG_PCI_FAKE is not set
425# CONFIG_HOTPLUG_PCI_ACPI is not set
426# CONFIG_HOTPLUG_PCI_CPCI is not set
427# CONFIG_HOTPLUG_PCI_SHPC is not set
272 428
273# 429#
274# Executable file formats / Emulations 430# Executable file formats / Emulations
275# 431#
276CONFIG_BINFMT_ELF=y 432CONFIG_BINFMT_ELF=y
277# CONFIG_BINFMT_MISC is not set 433CONFIG_COMPAT_BINFMT_ELF=y
434CONFIG_BINFMT_MISC=y
278CONFIG_IA32_EMULATION=y 435CONFIG_IA32_EMULATION=y
279CONFIG_IA32_AOUT=y 436# CONFIG_IA32_AOUT is not set
280CONFIG_COMPAT=y 437CONFIG_COMPAT=y
438CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
281CONFIG_SYSVIPC_COMPAT=y 439CONFIG_SYSVIPC_COMPAT=y
282 440
283# 441#
@@ -289,22 +447,31 @@ CONFIG_NET=y
289# Networking options 447# Networking options
290# 448#
291CONFIG_PACKET=y 449CONFIG_PACKET=y
292# CONFIG_PACKET_MMAP is not set 450CONFIG_PACKET_MMAP=y
293CONFIG_UNIX=y 451CONFIG_UNIX=y
452CONFIG_XFRM=y
453CONFIG_XFRM_USER=y
454# CONFIG_XFRM_SUB_POLICY is not set
455# CONFIG_XFRM_MIGRATE is not set
456# CONFIG_XFRM_STATISTICS is not set
294# CONFIG_NET_KEY is not set 457# CONFIG_NET_KEY is not set
295CONFIG_INET=y 458CONFIG_INET=y
296CONFIG_IP_MULTICAST=y 459CONFIG_IP_MULTICAST=y
297# CONFIG_IP_ADVANCED_ROUTER is not set 460CONFIG_IP_ADVANCED_ROUTER=y
461CONFIG_ASK_IP_FIB_HASH=y
462# CONFIG_IP_FIB_TRIE is not set
298CONFIG_IP_FIB_HASH=y 463CONFIG_IP_FIB_HASH=y
299CONFIG_IP_PNP=y 464CONFIG_IP_MULTIPLE_TABLES=y
300CONFIG_IP_PNP_DHCP=y 465CONFIG_IP_ROUTE_MULTIPATH=y
301# CONFIG_IP_PNP_BOOTP is not set 466CONFIG_IP_ROUTE_VERBOSE=y
302# CONFIG_IP_PNP_RARP is not set 467# CONFIG_IP_PNP is not set
303# CONFIG_NET_IPIP is not set 468# CONFIG_NET_IPIP is not set
304# CONFIG_NET_IPGRE is not set 469# CONFIG_NET_IPGRE is not set
305# CONFIG_IP_MROUTE is not set 470CONFIG_IP_MROUTE=y
471CONFIG_IP_PIMSM_V1=y
472CONFIG_IP_PIMSM_V2=y
306# CONFIG_ARPD is not set 473# CONFIG_ARPD is not set
307# CONFIG_SYN_COOKIES is not set 474CONFIG_SYN_COOKIES=y
308# CONFIG_INET_AH is not set 475# CONFIG_INET_AH is not set
309# CONFIG_INET_ESP is not set 476# CONFIG_INET_ESP is not set
310# CONFIG_INET_IPCOMP is not set 477# CONFIG_INET_IPCOMP is not set
@@ -313,31 +480,109 @@ CONFIG_INET_TUNNEL=y
313# CONFIG_INET_XFRM_MODE_TRANSPORT is not set 480# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
314# CONFIG_INET_XFRM_MODE_TUNNEL is not set 481# CONFIG_INET_XFRM_MODE_TUNNEL is not set
315# CONFIG_INET_XFRM_MODE_BEET is not set 482# CONFIG_INET_XFRM_MODE_BEET is not set
316CONFIG_INET_DIAG=y 483CONFIG_INET_LRO=y
317CONFIG_INET_TCP_DIAG=y 484# CONFIG_INET_DIAG is not set
318# CONFIG_TCP_CONG_ADVANCED is not set 485CONFIG_TCP_CONG_ADVANCED=y
486# CONFIG_TCP_CONG_BIC is not set
319CONFIG_TCP_CONG_CUBIC=y 487CONFIG_TCP_CONG_CUBIC=y
488# CONFIG_TCP_CONG_WESTWOOD is not set
489# CONFIG_TCP_CONG_HTCP is not set
490# CONFIG_TCP_CONG_HSTCP is not set
491# CONFIG_TCP_CONG_HYBLA is not set
492# CONFIG_TCP_CONG_VEGAS is not set
493# CONFIG_TCP_CONG_SCALABLE is not set
494# CONFIG_TCP_CONG_LP is not set
495# CONFIG_TCP_CONG_VENO is not set
496# CONFIG_TCP_CONG_YEAH is not set
497# CONFIG_TCP_CONG_ILLINOIS is not set
498# CONFIG_DEFAULT_BIC is not set
499CONFIG_DEFAULT_CUBIC=y
500# CONFIG_DEFAULT_HTCP is not set
501# CONFIG_DEFAULT_VEGAS is not set
502# CONFIG_DEFAULT_WESTWOOD is not set
503# CONFIG_DEFAULT_RENO is not set
320CONFIG_DEFAULT_TCP_CONG="cubic" 504CONFIG_DEFAULT_TCP_CONG="cubic"
321# CONFIG_TCP_MD5SIG is not set 505CONFIG_TCP_MD5SIG=y
506# CONFIG_IP_VS is not set
322CONFIG_IPV6=y 507CONFIG_IPV6=y
323# CONFIG_IPV6_PRIVACY is not set 508# CONFIG_IPV6_PRIVACY is not set
324# CONFIG_IPV6_ROUTER_PREF is not set 509# CONFIG_IPV6_ROUTER_PREF is not set
325# CONFIG_IPV6_OPTIMISTIC_DAD is not set 510# CONFIG_IPV6_OPTIMISTIC_DAD is not set
326# CONFIG_INET6_AH is not set 511CONFIG_INET6_AH=y
327# CONFIG_INET6_ESP is not set 512CONFIG_INET6_ESP=y
328# CONFIG_INET6_IPCOMP is not set 513# CONFIG_INET6_IPCOMP is not set
329# CONFIG_IPV6_MIP6 is not set 514# CONFIG_IPV6_MIP6 is not set
330# CONFIG_INET6_XFRM_TUNNEL is not set 515# CONFIG_INET6_XFRM_TUNNEL is not set
331# CONFIG_INET6_TUNNEL is not set 516# CONFIG_INET6_TUNNEL is not set
332# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set 517CONFIG_INET6_XFRM_MODE_TRANSPORT=y
333# CONFIG_INET6_XFRM_MODE_TUNNEL is not set 518CONFIG_INET6_XFRM_MODE_TUNNEL=y
334# CONFIG_INET6_XFRM_MODE_BEET is not set 519CONFIG_INET6_XFRM_MODE_BEET=y
335# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set 520# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
336CONFIG_IPV6_SIT=y 521CONFIG_IPV6_SIT=y
522CONFIG_IPV6_NDISC_NODETYPE=y
337# CONFIG_IPV6_TUNNEL is not set 523# CONFIG_IPV6_TUNNEL is not set
338# CONFIG_IPV6_MULTIPLE_TABLES is not set 524# CONFIG_IPV6_MULTIPLE_TABLES is not set
339# CONFIG_NETWORK_SECMARK is not set 525# CONFIG_IPV6_MROUTE is not set
340# CONFIG_NETFILTER is not set 526CONFIG_NETLABEL=y
527CONFIG_NETWORK_SECMARK=y
528CONFIG_NETFILTER=y
529# CONFIG_NETFILTER_DEBUG is not set
530# CONFIG_NETFILTER_ADVANCED is not set
531
532#
533# Core Netfilter Configuration
534#
535CONFIG_NETFILTER_NETLINK=y
536CONFIG_NETFILTER_NETLINK_LOG=y
537CONFIG_NF_CONNTRACK=y
538CONFIG_NF_CONNTRACK_SECMARK=y
539CONFIG_NF_CONNTRACK_FTP=y
540CONFIG_NF_CONNTRACK_IRC=y
541CONFIG_NF_CONNTRACK_SIP=y
542CONFIG_NF_CT_NETLINK=y
543CONFIG_NETFILTER_XTABLES=y
544CONFIG_NETFILTER_XT_TARGET_MARK=y
545CONFIG_NETFILTER_XT_TARGET_NFLOG=y
546CONFIG_NETFILTER_XT_TARGET_SECMARK=y
547CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
548CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
549CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
550CONFIG_NETFILTER_XT_MATCH_MARK=y
551CONFIG_NETFILTER_XT_MATCH_POLICY=y
552CONFIG_NETFILTER_XT_MATCH_STATE=y
553
554#
555# IP: Netfilter Configuration
556#
557CONFIG_NF_CONNTRACK_IPV4=y
558CONFIG_NF_CONNTRACK_PROC_COMPAT=y
559CONFIG_IP_NF_IPTABLES=y
560CONFIG_IP_NF_FILTER=y
561CONFIG_IP_NF_TARGET_REJECT=y
562CONFIG_IP_NF_TARGET_LOG=y
563CONFIG_IP_NF_TARGET_ULOG=y
564CONFIG_NF_NAT=y
565CONFIG_NF_NAT_NEEDED=y
566CONFIG_IP_NF_TARGET_MASQUERADE=y
567CONFIG_NF_NAT_FTP=y
568CONFIG_NF_NAT_IRC=y
569# CONFIG_NF_NAT_TFTP is not set
570# CONFIG_NF_NAT_AMANDA is not set
571# CONFIG_NF_NAT_PPTP is not set
572# CONFIG_NF_NAT_H323 is not set
573CONFIG_NF_NAT_SIP=y
574CONFIG_IP_NF_MANGLE=y
575
576#
577# IPv6: Netfilter Configuration
578#
579CONFIG_NF_CONNTRACK_IPV6=y
580CONFIG_IP6_NF_IPTABLES=y
581CONFIG_IP6_NF_MATCH_IPV6HEADER=y
582CONFIG_IP6_NF_FILTER=y
583CONFIG_IP6_NF_TARGET_LOG=y
584CONFIG_IP6_NF_TARGET_REJECT=y
585CONFIG_IP6_NF_MANGLE=y
341# CONFIG_IP_DCCP is not set 586# CONFIG_IP_DCCP is not set
342# CONFIG_IP_SCTP is not set 587# CONFIG_IP_SCTP is not set
343# CONFIG_TIPC is not set 588# CONFIG_TIPC is not set
@@ -345,6 +590,7 @@ CONFIG_IPV6_SIT=y
345# CONFIG_BRIDGE is not set 590# CONFIG_BRIDGE is not set
346# CONFIG_VLAN_8021Q is not set 591# CONFIG_VLAN_8021Q is not set
347# CONFIG_DECNET is not set 592# CONFIG_DECNET is not set
593CONFIG_LLC=y
348# CONFIG_LLC2 is not set 594# CONFIG_LLC2 is not set
349# CONFIG_IPX is not set 595# CONFIG_IPX is not set
350# CONFIG_ATALK is not set 596# CONFIG_ATALK is not set
@@ -352,28 +598,99 @@ CONFIG_IPV6_SIT=y
352# CONFIG_LAPB is not set 598# CONFIG_LAPB is not set
353# CONFIG_ECONET is not set 599# CONFIG_ECONET is not set
354# CONFIG_WAN_ROUTER is not set 600# CONFIG_WAN_ROUTER is not set
355 601CONFIG_NET_SCHED=y
356# 602
357# QoS and/or fair queueing 603#
358# 604# Queueing/Scheduling
359# CONFIG_NET_SCHED is not set 605#
606# CONFIG_NET_SCH_CBQ is not set
607# CONFIG_NET_SCH_HTB is not set
608# CONFIG_NET_SCH_HFSC is not set
609# CONFIG_NET_SCH_PRIO is not set
610# CONFIG_NET_SCH_RR is not set
611# CONFIG_NET_SCH_RED is not set
612# CONFIG_NET_SCH_SFQ is not set
613# CONFIG_NET_SCH_TEQL is not set
614# CONFIG_NET_SCH_TBF is not set
615# CONFIG_NET_SCH_GRED is not set
616# CONFIG_NET_SCH_DSMARK is not set
617# CONFIG_NET_SCH_NETEM is not set
618# CONFIG_NET_SCH_INGRESS is not set
619
620#
621# Classification
622#
623CONFIG_NET_CLS=y
624# CONFIG_NET_CLS_BASIC is not set
625# CONFIG_NET_CLS_TCINDEX is not set
626# CONFIG_NET_CLS_ROUTE4 is not set
627# CONFIG_NET_CLS_FW is not set
628# CONFIG_NET_CLS_U32 is not set
629# CONFIG_NET_CLS_RSVP is not set
630# CONFIG_NET_CLS_RSVP6 is not set
631# CONFIG_NET_CLS_FLOW is not set
632CONFIG_NET_EMATCH=y
633CONFIG_NET_EMATCH_STACK=32
634# CONFIG_NET_EMATCH_CMP is not set
635# CONFIG_NET_EMATCH_NBYTE is not set
636# CONFIG_NET_EMATCH_U32 is not set
637# CONFIG_NET_EMATCH_META is not set
638# CONFIG_NET_EMATCH_TEXT is not set
639CONFIG_NET_CLS_ACT=y
640# CONFIG_NET_ACT_POLICE is not set
641# CONFIG_NET_ACT_GACT is not set
642# CONFIG_NET_ACT_MIRRED is not set
643# CONFIG_NET_ACT_IPT is not set
644# CONFIG_NET_ACT_NAT is not set
645# CONFIG_NET_ACT_PEDIT is not set
646# CONFIG_NET_ACT_SIMP is not set
647CONFIG_NET_SCH_FIFO=y
360 648
361# 649#
362# Network testing 650# Network testing
363# 651#
364# CONFIG_NET_PKTGEN is not set 652# CONFIG_NET_PKTGEN is not set
365# CONFIG_NET_TCPPROBE is not set 653# CONFIG_NET_TCPPROBE is not set
366# CONFIG_HAMRADIO is not set 654CONFIG_HAMRADIO=y
655
656#
657# Packet Radio protocols
658#
659# CONFIG_AX25 is not set
660# CONFIG_CAN is not set
367# CONFIG_IRDA is not set 661# CONFIG_IRDA is not set
368# CONFIG_BT is not set 662# CONFIG_BT is not set
369# CONFIG_AF_RXRPC is not set 663# CONFIG_AF_RXRPC is not set
664CONFIG_FIB_RULES=y
370 665
371# 666#
372# Wireless 667# Wireless
373# 668#
374# CONFIG_CFG80211 is not set 669CONFIG_CFG80211=y
375# CONFIG_WIRELESS_EXT is not set 670CONFIG_NL80211=y
376# CONFIG_MAC80211 is not set 671CONFIG_WIRELESS_EXT=y
672CONFIG_MAC80211=y
673
674#
675# Rate control algorithm selection
676#
677CONFIG_MAC80211_RC_DEFAULT_PID=y
678# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
679
680#
681# Selecting 'y' for an algorithm will
682#
683
684#
685# build the algorithm into mac80211.
686#
687CONFIG_MAC80211_RC_DEFAULT="pid"
688CONFIG_MAC80211_RC_PID=y
689# CONFIG_MAC80211_MESH is not set
690CONFIG_MAC80211_LEDS=y
691# CONFIG_MAC80211_DEBUGFS is not set
692# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set
693# CONFIG_MAC80211_DEBUG is not set
377# CONFIG_IEEE80211 is not set 694# CONFIG_IEEE80211 is not set
378# CONFIG_RFKILL is not set 695# CONFIG_RFKILL is not set
379# CONFIG_NET_9P is not set 696# CONFIG_NET_9P is not set
@@ -385,13 +702,15 @@ CONFIG_IPV6_SIT=y
385# 702#
386# Generic Driver Options 703# Generic Driver Options
387# 704#
705CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
388CONFIG_STANDALONE=y 706CONFIG_STANDALONE=y
389CONFIG_PREVENT_FIRMWARE_BUILD=y 707CONFIG_PREVENT_FIRMWARE_BUILD=y
390CONFIG_FW_LOADER=y 708CONFIG_FW_LOADER=y
391# CONFIG_DEBUG_DRIVER is not set 709# CONFIG_DEBUG_DRIVER is not set
392# CONFIG_DEBUG_DEVRES is not set 710CONFIG_DEBUG_DEVRES=y
393# CONFIG_SYS_HYPERVISOR is not set 711# CONFIG_SYS_HYPERVISOR is not set
394# CONFIG_CONNECTOR is not set 712CONFIG_CONNECTOR=y
713CONFIG_PROC_EVENTS=y
395# CONFIG_MTD is not set 714# CONFIG_MTD is not set
396# CONFIG_PARPORT is not set 715# CONFIG_PARPORT is not set
397CONFIG_PNP=y 716CONFIG_PNP=y
@@ -402,7 +721,7 @@ CONFIG_PNP=y
402# 721#
403CONFIG_PNPACPI=y 722CONFIG_PNPACPI=y
404CONFIG_BLK_DEV=y 723CONFIG_BLK_DEV=y
405CONFIG_BLK_DEV_FD=y 724# CONFIG_BLK_DEV_FD is not set
406# CONFIG_BLK_CPQ_DA is not set 725# CONFIG_BLK_CPQ_DA is not set
407# CONFIG_BLK_CPQ_CISS_DA is not set 726# CONFIG_BLK_CPQ_CISS_DA is not set
408# CONFIG_BLK_DEV_DAC960 is not set 727# CONFIG_BLK_DEV_DAC960 is not set
@@ -415,8 +734,8 @@ CONFIG_BLK_DEV_LOOP=y
415# CONFIG_BLK_DEV_UB is not set 734# CONFIG_BLK_DEV_UB is not set
416CONFIG_BLK_DEV_RAM=y 735CONFIG_BLK_DEV_RAM=y
417CONFIG_BLK_DEV_RAM_COUNT=16 736CONFIG_BLK_DEV_RAM_COUNT=16
418CONFIG_BLK_DEV_RAM_SIZE=4096 737CONFIG_BLK_DEV_RAM_SIZE=16384
419CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 738# CONFIG_BLK_DEV_XIP is not set
420# CONFIG_CDROM_PKTCDVD is not set 739# CONFIG_CDROM_PKTCDVD is not set
421# CONFIG_ATA_OVER_ETH is not set 740# CONFIG_ATA_OVER_ETH is not set
422CONFIG_MISC_DEVICES=y 741CONFIG_MISC_DEVICES=y
@@ -425,72 +744,16 @@ CONFIG_MISC_DEVICES=y
425# CONFIG_EEPROM_93CX6 is not set 744# CONFIG_EEPROM_93CX6 is not set
426# CONFIG_SGI_IOC4 is not set 745# CONFIG_SGI_IOC4 is not set
427# CONFIG_TIFM_CORE is not set 746# CONFIG_TIFM_CORE is not set
747# CONFIG_ACER_WMI is not set
748# CONFIG_ASUS_LAPTOP is not set
749# CONFIG_FUJITSU_LAPTOP is not set
750# CONFIG_MSI_LAPTOP is not set
428# CONFIG_SONY_LAPTOP is not set 751# CONFIG_SONY_LAPTOP is not set
429# CONFIG_THINKPAD_ACPI is not set 752# CONFIG_THINKPAD_ACPI is not set
430CONFIG_IDE=y 753# CONFIG_INTEL_MENLOW is not set
431CONFIG_BLK_DEV_IDE=y 754# CONFIG_ENCLOSURE_SERVICES is not set
432 755CONFIG_HAVE_IDE=y
433# 756# CONFIG_IDE is not set
434# Please see Documentation/ide.txt for help/info on IDE drives
435#
436# CONFIG_BLK_DEV_IDE_SATA is not set
437# CONFIG_BLK_DEV_HD_IDE is not set
438CONFIG_BLK_DEV_IDEDISK=y
439CONFIG_IDEDISK_MULTI_MODE=y
440CONFIG_BLK_DEV_IDECD=y
441# CONFIG_BLK_DEV_IDETAPE is not set
442# CONFIG_BLK_DEV_IDEFLOPPY is not set
443# CONFIG_BLK_DEV_IDESCSI is not set
444CONFIG_BLK_DEV_IDEACPI=y
445# CONFIG_IDE_TASK_IOCTL is not set
446CONFIG_IDE_PROC_FS=y
447
448#
449# IDE chipset support/bugfixes
450#
451CONFIG_IDE_GENERIC=y
452# CONFIG_BLK_DEV_CMD640 is not set
453# CONFIG_BLK_DEV_IDEPNP is not set
454CONFIG_BLK_DEV_IDEPCI=y
455# CONFIG_IDEPCI_SHARE_IRQ is not set
456CONFIG_IDEPCI_PCIBUS_ORDER=y
457# CONFIG_BLK_DEV_OFFBOARD is not set
458# CONFIG_BLK_DEV_GENERIC is not set
459# CONFIG_BLK_DEV_OPTI621 is not set
460# CONFIG_BLK_DEV_RZ1000 is not set
461CONFIG_BLK_DEV_IDEDMA_PCI=y
462# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
463# CONFIG_IDEDMA_ONLYDISK is not set
464# CONFIG_BLK_DEV_AEC62XX is not set
465# CONFIG_BLK_DEV_ALI15X3 is not set
466CONFIG_BLK_DEV_AMD74XX=y
467CONFIG_BLK_DEV_ATIIXP=y
468# CONFIG_BLK_DEV_CMD64X is not set
469# CONFIG_BLK_DEV_TRIFLEX is not set
470# CONFIG_BLK_DEV_CY82C693 is not set
471# CONFIG_BLK_DEV_CS5520 is not set
472# CONFIG_BLK_DEV_CS5530 is not set
473# CONFIG_BLK_DEV_HPT34X is not set
474# CONFIG_BLK_DEV_HPT366 is not set
475# CONFIG_BLK_DEV_JMICRON is not set
476# CONFIG_BLK_DEV_SC1200 is not set
477CONFIG_BLK_DEV_PIIX=y
478# CONFIG_BLK_DEV_IT8213 is not set
479# CONFIG_BLK_DEV_IT821X is not set
480# CONFIG_BLK_DEV_NS87415 is not set
481# CONFIG_BLK_DEV_PDC202XX_OLD is not set
482CONFIG_BLK_DEV_PDC202XX_NEW=y
483# CONFIG_BLK_DEV_SVWKS is not set
484# CONFIG_BLK_DEV_SIIMAGE is not set
485# CONFIG_BLK_DEV_SIS5513 is not set
486# CONFIG_BLK_DEV_SLC90E66 is not set
487# CONFIG_BLK_DEV_TRM290 is not set
488# CONFIG_BLK_DEV_VIA82CXXX is not set
489# CONFIG_BLK_DEV_TC86C001 is not set
490# CONFIG_IDE_ARM is not set
491CONFIG_BLK_DEV_IDEDMA=y
492# CONFIG_IDEDMA_IVB is not set
493# CONFIG_BLK_DEV_HD is not set
494 757
495# 758#
496# SCSI device support 759# SCSI device support
@@ -499,8 +762,8 @@ CONFIG_BLK_DEV_IDEDMA=y
499CONFIG_SCSI=y 762CONFIG_SCSI=y
500CONFIG_SCSI_DMA=y 763CONFIG_SCSI_DMA=y
501# CONFIG_SCSI_TGT is not set 764# CONFIG_SCSI_TGT is not set
502CONFIG_SCSI_NETLINK=y 765# CONFIG_SCSI_NETLINK is not set
503# CONFIG_SCSI_PROC_FS is not set 766CONFIG_SCSI_PROC_FS=y
504 767
505# 768#
506# SCSI support type (disk, tape, CD-ROM) 769# SCSI support type (disk, tape, CD-ROM)
@@ -509,7 +772,7 @@ CONFIG_BLK_DEV_SD=y
509# CONFIG_CHR_DEV_ST is not set 772# CONFIG_CHR_DEV_ST is not set
510# CONFIG_CHR_DEV_OSST is not set 773# CONFIG_CHR_DEV_OSST is not set
511CONFIG_BLK_DEV_SR=y 774CONFIG_BLK_DEV_SR=y
512# CONFIG_BLK_DEV_SR_VENDOR is not set 775CONFIG_BLK_DEV_SR_VENDOR=y
513CONFIG_CHR_DEV_SG=y 776CONFIG_CHR_DEV_SG=y
514# CONFIG_CHR_DEV_SCH is not set 777# CONFIG_CHR_DEV_SCH is not set
515 778
@@ -526,73 +789,37 @@ CONFIG_SCSI_WAIT_SCAN=m
526# SCSI Transports 789# SCSI Transports
527# 790#
528CONFIG_SCSI_SPI_ATTRS=y 791CONFIG_SCSI_SPI_ATTRS=y
529CONFIG_SCSI_FC_ATTRS=y 792# CONFIG_SCSI_FC_ATTRS is not set
530# CONFIG_SCSI_ISCSI_ATTRS is not set 793# CONFIG_SCSI_ISCSI_ATTRS is not set
531CONFIG_SCSI_SAS_ATTRS=y 794# CONFIG_SCSI_SAS_ATTRS is not set
532# CONFIG_SCSI_SAS_LIBSAS is not set 795# CONFIG_SCSI_SAS_LIBSAS is not set
533 796# CONFIG_SCSI_SRP_ATTRS is not set
534# 797# CONFIG_SCSI_LOWLEVEL is not set
535# SCSI low-level drivers 798# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
536#
537# CONFIG_ISCSI_TCP is not set
538# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
539# CONFIG_SCSI_3W_9XXX is not set
540# CONFIG_SCSI_ACARD is not set
541# CONFIG_SCSI_AACRAID is not set
542# CONFIG_SCSI_AIC7XXX is not set
543# CONFIG_SCSI_AIC7XXX_OLD is not set
544CONFIG_SCSI_AIC79XX=y
545CONFIG_AIC79XX_CMDS_PER_DEVICE=32
546CONFIG_AIC79XX_RESET_DELAY_MS=4000
547# CONFIG_AIC79XX_DEBUG_ENABLE is not set
548CONFIG_AIC79XX_DEBUG_MASK=0
549# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
550# CONFIG_SCSI_AIC94XX is not set
551# CONFIG_SCSI_ARCMSR is not set
552# CONFIG_MEGARAID_NEWGEN is not set
553# CONFIG_MEGARAID_LEGACY is not set
554# CONFIG_MEGARAID_SAS is not set
555# CONFIG_SCSI_HPTIOP is not set
556# CONFIG_SCSI_BUSLOGIC is not set
557# CONFIG_SCSI_DMX3191D is not set
558# CONFIG_SCSI_EATA is not set
559# CONFIG_SCSI_FUTURE_DOMAIN is not set
560# CONFIG_SCSI_GDTH is not set
561# CONFIG_SCSI_IPS is not set
562# CONFIG_SCSI_INITIO is not set
563# CONFIG_SCSI_INIA100 is not set
564# CONFIG_SCSI_STEX is not set
565# CONFIG_SCSI_SYM53C8XX_2 is not set
566# CONFIG_SCSI_IPR is not set
567# CONFIG_SCSI_QLOGIC_1280 is not set
568# CONFIG_SCSI_QLA_FC is not set
569# CONFIG_SCSI_QLA_ISCSI is not set
570# CONFIG_SCSI_LPFC is not set
571# CONFIG_SCSI_DC395x is not set
572# CONFIG_SCSI_DC390T is not set
573# CONFIG_SCSI_DEBUG is not set
574# CONFIG_SCSI_SRP is not set
575CONFIG_ATA=y 799CONFIG_ATA=y
576# CONFIG_ATA_NONSTANDARD is not set 800# CONFIG_ATA_NONSTANDARD is not set
577CONFIG_ATA_ACPI=y 801CONFIG_ATA_ACPI=y
802CONFIG_SATA_PMP=y
578CONFIG_SATA_AHCI=y 803CONFIG_SATA_AHCI=y
579CONFIG_SATA_SVW=y 804# CONFIG_SATA_SIL24 is not set
805CONFIG_ATA_SFF=y
806# CONFIG_SATA_SVW is not set
580CONFIG_ATA_PIIX=y 807CONFIG_ATA_PIIX=y
581# CONFIG_SATA_MV is not set 808# CONFIG_SATA_MV is not set
582CONFIG_SATA_NV=y 809# CONFIG_SATA_NV is not set
583# CONFIG_PDC_ADMA is not set 810# CONFIG_PDC_ADMA is not set
584# CONFIG_SATA_QSTOR is not set 811# CONFIG_SATA_QSTOR is not set
585# CONFIG_SATA_PROMISE is not set 812# CONFIG_SATA_PROMISE is not set
586# CONFIG_SATA_SX4 is not set 813# CONFIG_SATA_SX4 is not set
587CONFIG_SATA_SIL=y 814# CONFIG_SATA_SIL is not set
588# CONFIG_SATA_SIL24 is not set
589# CONFIG_SATA_SIS is not set 815# CONFIG_SATA_SIS is not set
590# CONFIG_SATA_ULI is not set 816# CONFIG_SATA_ULI is not set
591CONFIG_SATA_VIA=y 817# CONFIG_SATA_VIA is not set
592# CONFIG_SATA_VITESSE is not set 818# CONFIG_SATA_VITESSE is not set
593# CONFIG_SATA_INIC162X is not set 819# CONFIG_SATA_INIC162X is not set
820# CONFIG_PATA_ACPI is not set
594# CONFIG_PATA_ALI is not set 821# CONFIG_PATA_ALI is not set
595# CONFIG_PATA_AMD is not set 822CONFIG_PATA_AMD=y
596# CONFIG_PATA_ARTOP is not set 823# CONFIG_PATA_ARTOP is not set
597# CONFIG_PATA_ATIIXP is not set 824# CONFIG_PATA_ATIIXP is not set
598# CONFIG_PATA_CMD640_PCI is not set 825# CONFIG_PATA_CMD640_PCI is not set
@@ -612,11 +839,14 @@ CONFIG_SATA_VIA=y
612# CONFIG_PATA_TRIFLEX is not set 839# CONFIG_PATA_TRIFLEX is not set
613# CONFIG_PATA_MARVELL is not set 840# CONFIG_PATA_MARVELL is not set
614# CONFIG_PATA_MPIIX is not set 841# CONFIG_PATA_MPIIX is not set
615# CONFIG_PATA_OLDPIIX is not set 842CONFIG_PATA_OLDPIIX=y
616# CONFIG_PATA_NETCELL is not set 843# CONFIG_PATA_NETCELL is not set
844# CONFIG_PATA_NINJA32 is not set
617# CONFIG_PATA_NS87410 is not set 845# CONFIG_PATA_NS87410 is not set
846# CONFIG_PATA_NS87415 is not set
618# CONFIG_PATA_OPTI is not set 847# CONFIG_PATA_OPTI is not set
619# CONFIG_PATA_OPTIDMA is not set 848# CONFIG_PATA_OPTIDMA is not set
849# CONFIG_PATA_PCMCIA is not set
620# CONFIG_PATA_PDC_OLD is not set 850# CONFIG_PATA_PDC_OLD is not set
621# CONFIG_PATA_RADISYS is not set 851# CONFIG_PATA_RADISYS is not set
622# CONFIG_PATA_RZ1000 is not set 852# CONFIG_PATA_RZ1000 is not set
@@ -628,65 +858,42 @@ CONFIG_SATA_VIA=y
628# CONFIG_PATA_VIA is not set 858# CONFIG_PATA_VIA is not set
629# CONFIG_PATA_WINBOND is not set 859# CONFIG_PATA_WINBOND is not set
630CONFIG_MD=y 860CONFIG_MD=y
631# CONFIG_BLK_DEV_MD is not set 861CONFIG_BLK_DEV_MD=y
862# CONFIG_MD_LINEAR is not set
863# CONFIG_MD_RAID0 is not set
864# CONFIG_MD_RAID1 is not set
865# CONFIG_MD_RAID10 is not set
866# CONFIG_MD_RAID456 is not set
867# CONFIG_MD_MULTIPATH is not set
868# CONFIG_MD_FAULTY is not set
632CONFIG_BLK_DEV_DM=y 869CONFIG_BLK_DEV_DM=y
633# CONFIG_DM_DEBUG is not set 870# CONFIG_DM_DEBUG is not set
634# CONFIG_DM_CRYPT is not set 871# CONFIG_DM_CRYPT is not set
635# CONFIG_DM_SNAPSHOT is not set 872# CONFIG_DM_SNAPSHOT is not set
636# CONFIG_DM_MIRROR is not set 873CONFIG_DM_MIRROR=y
637# CONFIG_DM_ZERO is not set 874CONFIG_DM_ZERO=y
638# CONFIG_DM_MULTIPATH is not set 875# CONFIG_DM_MULTIPATH is not set
639# CONFIG_DM_DELAY is not set 876# CONFIG_DM_DELAY is not set
640 877# CONFIG_DM_UEVENT is not set
641# 878# CONFIG_FUSION is not set
642# Fusion MPT device support
643#
644CONFIG_FUSION=y
645CONFIG_FUSION_SPI=y
646# CONFIG_FUSION_FC is not set
647# CONFIG_FUSION_SAS is not set
648CONFIG_FUSION_MAX_SGE=128
649# CONFIG_FUSION_CTL is not set
650 879
651# 880#
652# IEEE 1394 (FireWire) support 881# IEEE 1394 (FireWire) support
653# 882#
654# CONFIG_FIREWIRE is not set 883# CONFIG_FIREWIRE is not set
655CONFIG_IEEE1394=y 884# CONFIG_IEEE1394 is not set
656
657#
658# Subsystem Options
659#
660# CONFIG_IEEE1394_VERBOSEDEBUG is not set
661
662#
663# Controllers
664#
665
666#
667# Texas Instruments PCILynx requires I2C
668#
669CONFIG_IEEE1394_OHCI1394=y
670
671#
672# Protocols
673#
674# CONFIG_IEEE1394_VIDEO1394 is not set
675# CONFIG_IEEE1394_SBP2 is not set
676# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set
677# CONFIG_IEEE1394_ETH1394 is not set
678# CONFIG_IEEE1394_DV1394 is not set
679CONFIG_IEEE1394_RAWIO=y
680# CONFIG_I2O is not set 885# CONFIG_I2O is not set
681CONFIG_MACINTOSH_DRIVERS=y 886CONFIG_MACINTOSH_DRIVERS=y
682# CONFIG_MAC_EMUMOUSEBTN is not set 887CONFIG_MAC_EMUMOUSEBTN=y
683CONFIG_NETDEVICES=y 888CONFIG_NETDEVICES=y
684CONFIG_NETDEVICES_MULTIQUEUE=y 889# CONFIG_NETDEVICES_MULTIQUEUE is not set
890# CONFIG_IFB is not set
685# CONFIG_DUMMY is not set 891# CONFIG_DUMMY is not set
686# CONFIG_BONDING is not set 892# CONFIG_BONDING is not set
687# CONFIG_MACVLAN is not set 893# CONFIG_MACVLAN is not set
688# CONFIG_EQUALIZER is not set 894# CONFIG_EQUALIZER is not set
689CONFIG_TUN=y 895# CONFIG_TUN is not set
896# CONFIG_VETH is not set
690# CONFIG_NET_SB1000 is not set 897# CONFIG_NET_SB1000 is not set
691# CONFIG_ARCNET is not set 898# CONFIG_ARCNET is not set
692# CONFIG_PHYLIB is not set 899# CONFIG_PHYLIB is not set
@@ -696,39 +903,40 @@ CONFIG_MII=y
696# CONFIG_SUNGEM is not set 903# CONFIG_SUNGEM is not set
697# CONFIG_CASSINI is not set 904# CONFIG_CASSINI is not set
698CONFIG_NET_VENDOR_3COM=y 905CONFIG_NET_VENDOR_3COM=y
699CONFIG_VORTEX=y 906# CONFIG_VORTEX is not set
700# CONFIG_TYPHOON is not set 907# CONFIG_TYPHOON is not set
701CONFIG_NET_TULIP=y 908CONFIG_NET_TULIP=y
702# CONFIG_DE2104X is not set 909# CONFIG_DE2104X is not set
703CONFIG_TULIP=y 910# CONFIG_TULIP is not set
704# CONFIG_TULIP_MWI is not set
705# CONFIG_TULIP_MMIO is not set
706# CONFIG_TULIP_NAPI is not set
707# CONFIG_DE4X5 is not set 911# CONFIG_DE4X5 is not set
708# CONFIG_WINBOND_840 is not set 912# CONFIG_WINBOND_840 is not set
709# CONFIG_DM9102 is not set 913# CONFIG_DM9102 is not set
710# CONFIG_ULI526X is not set 914# CONFIG_ULI526X is not set
915# CONFIG_PCMCIA_XIRCOM is not set
711# CONFIG_HP100 is not set 916# CONFIG_HP100 is not set
917# CONFIG_IBM_NEW_EMAC_ZMII is not set
918# CONFIG_IBM_NEW_EMAC_RGMII is not set
919# CONFIG_IBM_NEW_EMAC_TAH is not set
920# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
712CONFIG_NET_PCI=y 921CONFIG_NET_PCI=y
713# CONFIG_PCNET32 is not set 922# CONFIG_PCNET32 is not set
714CONFIG_AMD8111_ETH=y 923# CONFIG_AMD8111_ETH is not set
715# CONFIG_AMD8111E_NAPI is not set
716# CONFIG_ADAPTEC_STARFIRE is not set 924# CONFIG_ADAPTEC_STARFIRE is not set
717CONFIG_B44=y 925# CONFIG_B44 is not set
718CONFIG_FORCEDETH=y 926CONFIG_FORCEDETH=y
719# CONFIG_FORCEDETH_NAPI is not set 927# CONFIG_FORCEDETH_NAPI is not set
720# CONFIG_DGRS is not set
721# CONFIG_EEPRO100 is not set 928# CONFIG_EEPRO100 is not set
722CONFIG_E100=y 929CONFIG_E100=y
723# CONFIG_FEALNX is not set 930# CONFIG_FEALNX is not set
724# CONFIG_NATSEMI is not set 931# CONFIG_NATSEMI is not set
725# CONFIG_NE2K_PCI is not set 932# CONFIG_NE2K_PCI is not set
726CONFIG_8139CP=y 933# CONFIG_8139CP is not set
727CONFIG_8139TOO=y 934CONFIG_8139TOO=y
728# CONFIG_8139TOO_PIO is not set 935CONFIG_8139TOO_PIO=y
729# CONFIG_8139TOO_TUNE_TWISTER is not set 936# CONFIG_8139TOO_TUNE_TWISTER is not set
730# CONFIG_8139TOO_8129 is not set 937# CONFIG_8139TOO_8129 is not set
731# CONFIG_8139_OLD_RX_RESET is not set 938# CONFIG_8139_OLD_RX_RESET is not set
939# CONFIG_R6040 is not set
732# CONFIG_SIS900 is not set 940# CONFIG_SIS900 is not set
733# CONFIG_EPIC100 is not set 941# CONFIG_EPIC100 is not set
734# CONFIG_SUNDANCE is not set 942# CONFIG_SUNDANCE is not set
@@ -740,34 +948,74 @@ CONFIG_NETDEV_1000=y
740CONFIG_E1000=y 948CONFIG_E1000=y
741# CONFIG_E1000_NAPI is not set 949# CONFIG_E1000_NAPI is not set
742# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 950# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
951# CONFIG_E1000E is not set
952# CONFIG_E1000E_ENABLED is not set
953# CONFIG_IP1000 is not set
954# CONFIG_IGB is not set
743# CONFIG_NS83820 is not set 955# CONFIG_NS83820 is not set
744# CONFIG_HAMACHI is not set 956# CONFIG_HAMACHI is not set
745# CONFIG_YELLOWFIN is not set 957# CONFIG_YELLOWFIN is not set
746# CONFIG_R8169 is not set 958# CONFIG_R8169 is not set
747# CONFIG_SIS190 is not set 959# CONFIG_SIS190 is not set
748# CONFIG_SKGE is not set 960# CONFIG_SKGE is not set
749# CONFIG_SKY2 is not set 961CONFIG_SKY2=y
962# CONFIG_SKY2_DEBUG is not set
750# CONFIG_VIA_VELOCITY is not set 963# CONFIG_VIA_VELOCITY is not set
751CONFIG_TIGON3=y 964CONFIG_TIGON3=y
752CONFIG_BNX2=y 965# CONFIG_BNX2 is not set
753# CONFIG_QLA3XXX is not set 966# CONFIG_QLA3XXX is not set
754# CONFIG_ATL1 is not set 967# CONFIG_ATL1 is not set
755CONFIG_NETDEV_10000=y 968CONFIG_NETDEV_10000=y
756# CONFIG_CHELSIO_T1 is not set 969# CONFIG_CHELSIO_T1 is not set
757# CONFIG_CHELSIO_T3 is not set 970# CONFIG_CHELSIO_T3 is not set
971# CONFIG_IXGBE is not set
758# CONFIG_IXGB is not set 972# CONFIG_IXGB is not set
759CONFIG_S2IO=m 973# CONFIG_S2IO is not set
760# CONFIG_S2IO_NAPI is not set
761# CONFIG_MYRI10GE is not set 974# CONFIG_MYRI10GE is not set
762# CONFIG_NETXEN_NIC is not set 975# CONFIG_NETXEN_NIC is not set
976# CONFIG_NIU is not set
763# CONFIG_MLX4_CORE is not set 977# CONFIG_MLX4_CORE is not set
764# CONFIG_TR is not set 978# CONFIG_TEHUTI is not set
979# CONFIG_BNX2X is not set
980# CONFIG_SFC is not set
981CONFIG_TR=y
982# CONFIG_IBMOL is not set
983# CONFIG_3C359 is not set
984# CONFIG_TMS380TR is not set
765 985
766# 986#
767# Wireless LAN 987# Wireless LAN
768# 988#
769# CONFIG_WLAN_PRE80211 is not set 989# CONFIG_WLAN_PRE80211 is not set
770# CONFIG_WLAN_80211 is not set 990CONFIG_WLAN_80211=y
991# CONFIG_PCMCIA_RAYCS is not set
992# CONFIG_IPW2100 is not set
993# CONFIG_IPW2200 is not set
994# CONFIG_LIBERTAS is not set
995# CONFIG_AIRO is not set
996# CONFIG_HERMES is not set
997# CONFIG_ATMEL is not set
998# CONFIG_AIRO_CS is not set
999# CONFIG_PCMCIA_WL3501 is not set
1000# CONFIG_PRISM54 is not set
1001# CONFIG_USB_ZD1201 is not set
1002# CONFIG_USB_NET_RNDIS_WLAN is not set
1003# CONFIG_RTL8180 is not set
1004# CONFIG_RTL8187 is not set
1005# CONFIG_ADM8211 is not set
1006# CONFIG_P54_COMMON is not set
1007CONFIG_ATH5K=y
1008# CONFIG_ATH5K_DEBUG is not set
1009# CONFIG_IWLWIFI is not set
1010# CONFIG_IWLCORE is not set
1011# CONFIG_IWLWIFI_LEDS is not set
1012# CONFIG_IWL4965 is not set
1013# CONFIG_IWL3945 is not set
1014# CONFIG_HOSTAP is not set
1015# CONFIG_B43 is not set
1016# CONFIG_B43LEGACY is not set
1017# CONFIG_ZD1211RW is not set
1018# CONFIG_RT2X00 is not set
771 1019
772# 1020#
773# USB Network Adapters 1021# USB Network Adapters
@@ -776,16 +1024,26 @@ CONFIG_S2IO=m
776# CONFIG_USB_KAWETH is not set 1024# CONFIG_USB_KAWETH is not set
777# CONFIG_USB_PEGASUS is not set 1025# CONFIG_USB_PEGASUS is not set
778# CONFIG_USB_RTL8150 is not set 1026# CONFIG_USB_RTL8150 is not set
779# CONFIG_USB_USBNET_MII is not set
780# CONFIG_USB_USBNET is not set 1027# CONFIG_USB_USBNET is not set
1028CONFIG_NET_PCMCIA=y
1029# CONFIG_PCMCIA_3C589 is not set
1030# CONFIG_PCMCIA_3C574 is not set
1031# CONFIG_PCMCIA_FMVJ18X is not set
1032# CONFIG_PCMCIA_PCNET is not set
1033# CONFIG_PCMCIA_NMCLAN is not set
1034# CONFIG_PCMCIA_SMC91C92 is not set
1035# CONFIG_PCMCIA_XIRC2PS is not set
1036# CONFIG_PCMCIA_AXNET is not set
781# CONFIG_WAN is not set 1037# CONFIG_WAN is not set
782# CONFIG_FDDI is not set 1038CONFIG_FDDI=y
1039# CONFIG_DEFXX is not set
1040# CONFIG_SKFP is not set
783# CONFIG_HIPPI is not set 1041# CONFIG_HIPPI is not set
784# CONFIG_PPP is not set 1042# CONFIG_PPP is not set
785# CONFIG_SLIP is not set 1043# CONFIG_SLIP is not set
786# CONFIG_NET_FC is not set 1044# CONFIG_NET_FC is not set
787# CONFIG_SHAPER is not set
788CONFIG_NETCONSOLE=y 1045CONFIG_NETCONSOLE=y
1046# CONFIG_NETCONSOLE_DYNAMIC is not set
789CONFIG_NETPOLL=y 1047CONFIG_NETPOLL=y
790# CONFIG_NETPOLL_TRAP is not set 1048# CONFIG_NETPOLL_TRAP is not set
791CONFIG_NET_POLL_CONTROLLER=y 1049CONFIG_NET_POLL_CONTROLLER=y
@@ -796,18 +1054,17 @@ CONFIG_NET_POLL_CONTROLLER=y
796# Input device support 1054# Input device support
797# 1055#
798CONFIG_INPUT=y 1056CONFIG_INPUT=y
799# CONFIG_INPUT_FF_MEMLESS is not set 1057CONFIG_INPUT_FF_MEMLESS=y
800# CONFIG_INPUT_POLLDEV is not set 1058CONFIG_INPUT_POLLDEV=y
801 1059
802# 1060#
803# Userland interfaces 1061# Userland interfaces
804# 1062#
805CONFIG_INPUT_MOUSEDEV=y 1063CONFIG_INPUT_MOUSEDEV=y
806CONFIG_INPUT_MOUSEDEV_PSAUX=y 1064# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
807CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 1065CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
808CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 1066CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
809# CONFIG_INPUT_JOYDEV is not set 1067# CONFIG_INPUT_JOYDEV is not set
810# CONFIG_INPUT_TSDEV is not set
811CONFIG_INPUT_EVDEV=y 1068CONFIG_INPUT_EVDEV=y
812# CONFIG_INPUT_EVBUG is not set 1069# CONFIG_INPUT_EVBUG is not set
813 1070
@@ -832,17 +1089,62 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
832# CONFIG_MOUSE_SERIAL is not set 1089# CONFIG_MOUSE_SERIAL is not set
833# CONFIG_MOUSE_APPLETOUCH is not set 1090# CONFIG_MOUSE_APPLETOUCH is not set
834# CONFIG_MOUSE_VSXXXAA is not set 1091# CONFIG_MOUSE_VSXXXAA is not set
835# CONFIG_INPUT_JOYSTICK is not set 1092CONFIG_INPUT_JOYSTICK=y
836# CONFIG_INPUT_TABLET is not set 1093# CONFIG_JOYSTICK_ANALOG is not set
837# CONFIG_INPUT_TOUCHSCREEN is not set 1094# CONFIG_JOYSTICK_A3D is not set
838# CONFIG_INPUT_MISC is not set 1095# CONFIG_JOYSTICK_ADI is not set
1096# CONFIG_JOYSTICK_COBRA is not set
1097# CONFIG_JOYSTICK_GF2K is not set
1098# CONFIG_JOYSTICK_GRIP is not set
1099# CONFIG_JOYSTICK_GRIP_MP is not set
1100# CONFIG_JOYSTICK_GUILLEMOT is not set
1101# CONFIG_JOYSTICK_INTERACT is not set
1102# CONFIG_JOYSTICK_SIDEWINDER is not set
1103# CONFIG_JOYSTICK_TMDC is not set
1104# CONFIG_JOYSTICK_IFORCE is not set
1105# CONFIG_JOYSTICK_WARRIOR is not set
1106# CONFIG_JOYSTICK_MAGELLAN is not set
1107# CONFIG_JOYSTICK_SPACEORB is not set
1108# CONFIG_JOYSTICK_SPACEBALL is not set
1109# CONFIG_JOYSTICK_STINGER is not set
1110# CONFIG_JOYSTICK_TWIDJOY is not set
1111# CONFIG_JOYSTICK_ZHENHUA is not set
1112# CONFIG_JOYSTICK_JOYDUMP is not set
1113# CONFIG_JOYSTICK_XPAD is not set
1114CONFIG_INPUT_TABLET=y
1115# CONFIG_TABLET_USB_ACECAD is not set
1116# CONFIG_TABLET_USB_AIPTEK is not set
1117# CONFIG_TABLET_USB_GTCO is not set
1118# CONFIG_TABLET_USB_KBTAB is not set
1119# CONFIG_TABLET_USB_WACOM is not set
1120CONFIG_INPUT_TOUCHSCREEN=y
1121# CONFIG_TOUCHSCREEN_FUJITSU is not set
1122# CONFIG_TOUCHSCREEN_GUNZE is not set
1123# CONFIG_TOUCHSCREEN_ELO is not set
1124# CONFIG_TOUCHSCREEN_MTOUCH is not set
1125# CONFIG_TOUCHSCREEN_MK712 is not set
1126# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1127# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1128# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1129# CONFIG_TOUCHSCREEN_UCB1400 is not set
1130# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1131CONFIG_INPUT_MISC=y
1132# CONFIG_INPUT_PCSPKR is not set
1133# CONFIG_INPUT_APANEL is not set
1134# CONFIG_INPUT_ATLAS_BTNS is not set
1135# CONFIG_INPUT_ATI_REMOTE is not set
1136# CONFIG_INPUT_ATI_REMOTE2 is not set
1137# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1138# CONFIG_INPUT_POWERMATE is not set
1139# CONFIG_INPUT_YEALINK is not set
1140# CONFIG_INPUT_UINPUT is not set
839 1141
840# 1142#
841# Hardware I/O ports 1143# Hardware I/O ports
842# 1144#
843CONFIG_SERIO=y 1145CONFIG_SERIO=y
844CONFIG_SERIO_I8042=y 1146CONFIG_SERIO_I8042=y
845# CONFIG_SERIO_SERPORT is not set 1147CONFIG_SERIO_SERPORT=y
846# CONFIG_SERIO_CT82C710 is not set 1148# CONFIG_SERIO_CT82C710 is not set
847# CONFIG_SERIO_PCIPS2 is not set 1149# CONFIG_SERIO_PCIPS2 is not set
848CONFIG_SERIO_LIBPS2=y 1150CONFIG_SERIO_LIBPS2=y
@@ -855,8 +1157,26 @@ CONFIG_SERIO_LIBPS2=y
855CONFIG_VT=y 1157CONFIG_VT=y
856CONFIG_VT_CONSOLE=y 1158CONFIG_VT_CONSOLE=y
857CONFIG_HW_CONSOLE=y 1159CONFIG_HW_CONSOLE=y
858# CONFIG_VT_HW_CONSOLE_BINDING is not set 1160CONFIG_VT_HW_CONSOLE_BINDING=y
859# CONFIG_SERIAL_NONSTANDARD is not set 1161CONFIG_DEVKMEM=y
1162CONFIG_SERIAL_NONSTANDARD=y
1163# CONFIG_COMPUTONE is not set
1164# CONFIG_ROCKETPORT is not set
1165# CONFIG_CYCLADES is not set
1166# CONFIG_DIGIEPCA is not set
1167# CONFIG_MOXA_INTELLIO is not set
1168# CONFIG_MOXA_SMARTIO is not set
1169# CONFIG_ISI is not set
1170# CONFIG_SYNCLINK is not set
1171# CONFIG_SYNCLINKMP is not set
1172# CONFIG_SYNCLINK_GT is not set
1173# CONFIG_N_HDLC is not set
1174# CONFIG_RISCOM8 is not set
1175# CONFIG_SPECIALIX is not set
1176# CONFIG_SX is not set
1177# CONFIG_RIO is not set
1178# CONFIG_STALDRV is not set
1179# CONFIG_NOZOMI is not set
860 1180
861# 1181#
862# Serial drivers 1182# Serial drivers
@@ -866,9 +1186,14 @@ CONFIG_SERIAL_8250_CONSOLE=y
866CONFIG_FIX_EARLYCON_MEM=y 1186CONFIG_FIX_EARLYCON_MEM=y
867CONFIG_SERIAL_8250_PCI=y 1187CONFIG_SERIAL_8250_PCI=y
868CONFIG_SERIAL_8250_PNP=y 1188CONFIG_SERIAL_8250_PNP=y
869CONFIG_SERIAL_8250_NR_UARTS=4 1189# CONFIG_SERIAL_8250_CS is not set
1190CONFIG_SERIAL_8250_NR_UARTS=32
870CONFIG_SERIAL_8250_RUNTIME_UARTS=4 1191CONFIG_SERIAL_8250_RUNTIME_UARTS=4
871# CONFIG_SERIAL_8250_EXTENDED is not set 1192CONFIG_SERIAL_8250_EXTENDED=y
1193CONFIG_SERIAL_8250_MANY_PORTS=y
1194CONFIG_SERIAL_8250_SHARE_IRQ=y
1195CONFIG_SERIAL_8250_DETECT_IRQ=y
1196CONFIG_SERIAL_8250_RSA=y
872 1197
873# 1198#
874# Non-8250 serial port support 1199# Non-8250 serial port support
@@ -877,78 +1202,260 @@ CONFIG_SERIAL_CORE=y
877CONFIG_SERIAL_CORE_CONSOLE=y 1202CONFIG_SERIAL_CORE_CONSOLE=y
878# CONFIG_SERIAL_JSM is not set 1203# CONFIG_SERIAL_JSM is not set
879CONFIG_UNIX98_PTYS=y 1204CONFIG_UNIX98_PTYS=y
880CONFIG_LEGACY_PTYS=y 1205# CONFIG_LEGACY_PTYS is not set
881CONFIG_LEGACY_PTY_COUNT=256
882# CONFIG_IPMI_HANDLER is not set 1206# CONFIG_IPMI_HANDLER is not set
883# CONFIG_WATCHDOG is not set
884CONFIG_HW_RANDOM=y 1207CONFIG_HW_RANDOM=y
885CONFIG_HW_RANDOM_INTEL=y 1208# CONFIG_HW_RANDOM_INTEL is not set
886CONFIG_HW_RANDOM_AMD=y 1209# CONFIG_HW_RANDOM_AMD is not set
887# CONFIG_NVRAM is not set 1210CONFIG_NVRAM=y
888CONFIG_RTC=y
889# CONFIG_R3964 is not set 1211# CONFIG_R3964 is not set
890# CONFIG_APPLICOM is not set 1212# CONFIG_APPLICOM is not set
891CONFIG_AGP=y 1213
892CONFIG_AGP_AMD64=y 1214#
893CONFIG_AGP_INTEL=y 1215# PCMCIA character devices
894# CONFIG_AGP_SIS is not set 1216#
895# CONFIG_AGP_VIA is not set 1217# CONFIG_SYNCLINK_CS is not set
896# CONFIG_DRM is not set 1218# CONFIG_CARDMAN_4000 is not set
1219# CONFIG_CARDMAN_4040 is not set
1220# CONFIG_IPWIRELESS is not set
897# CONFIG_MWAVE is not set 1221# CONFIG_MWAVE is not set
898# CONFIG_PC8736x_GPIO is not set 1222# CONFIG_PC8736x_GPIO is not set
899CONFIG_RAW_DRIVER=y 1223# CONFIG_RAW_DRIVER is not set
900CONFIG_MAX_RAW_DEVS=256
901CONFIG_HPET=y 1224CONFIG_HPET=y
902# CONFIG_HPET_RTC_IRQ is not set 1225# CONFIG_HPET_RTC_IRQ is not set
903CONFIG_HPET_MMAP=y 1226# CONFIG_HPET_MMAP is not set
904# CONFIG_HANGCHECK_TIMER is not set 1227# CONFIG_HANGCHECK_TIMER is not set
905# CONFIG_TCG_TPM is not set 1228# CONFIG_TCG_TPM is not set
906# CONFIG_TELCLOCK is not set 1229# CONFIG_TELCLOCK is not set
907CONFIG_DEVPORT=y 1230CONFIG_DEVPORT=y
908# CONFIG_I2C is not set 1231CONFIG_I2C=y
909 1232CONFIG_I2C_BOARDINFO=y
910# 1233# CONFIG_I2C_CHARDEV is not set
911# SPI support 1234
912# 1235#
1236# I2C Hardware Bus support
1237#
1238# CONFIG_I2C_ALI1535 is not set
1239# CONFIG_I2C_ALI1563 is not set
1240# CONFIG_I2C_ALI15X3 is not set
1241# CONFIG_I2C_AMD756 is not set
1242# CONFIG_I2C_AMD8111 is not set
1243CONFIG_I2C_I801=y
1244# CONFIG_I2C_I810 is not set
1245# CONFIG_I2C_PIIX4 is not set
1246# CONFIG_I2C_NFORCE2 is not set
1247# CONFIG_I2C_OCORES is not set
1248# CONFIG_I2C_PARPORT_LIGHT is not set
1249# CONFIG_I2C_PROSAVAGE is not set
1250# CONFIG_I2C_SAVAGE4 is not set
1251# CONFIG_I2C_SIMTEC is not set
1252# CONFIG_I2C_SIS5595 is not set
1253# CONFIG_I2C_SIS630 is not set
1254# CONFIG_I2C_SIS96X is not set
1255# CONFIG_I2C_TAOS_EVM is not set
1256# CONFIG_I2C_STUB is not set
1257# CONFIG_I2C_TINY_USB is not set
1258# CONFIG_I2C_VIA is not set
1259# CONFIG_I2C_VIAPRO is not set
1260# CONFIG_I2C_VOODOO3 is not set
1261# CONFIG_I2C_PCA_PLATFORM is not set
1262
1263#
1264# Miscellaneous I2C Chip support
1265#
1266# CONFIG_DS1682 is not set
1267# CONFIG_SENSORS_EEPROM is not set
1268# CONFIG_SENSORS_PCF8574 is not set
1269# CONFIG_PCF8575 is not set
1270# CONFIG_SENSORS_PCF8591 is not set
1271# CONFIG_SENSORS_MAX6875 is not set
1272# CONFIG_SENSORS_TSL2550 is not set
1273# CONFIG_I2C_DEBUG_CORE is not set
1274# CONFIG_I2C_DEBUG_ALGO is not set
1275# CONFIG_I2C_DEBUG_BUS is not set
1276# CONFIG_I2C_DEBUG_CHIP is not set
913# CONFIG_SPI is not set 1277# CONFIG_SPI is not set
914# CONFIG_SPI_MASTER is not set
915# CONFIG_W1 is not set 1278# CONFIG_W1 is not set
916# CONFIG_POWER_SUPPLY is not set 1279CONFIG_POWER_SUPPLY=y
1280# CONFIG_POWER_SUPPLY_DEBUG is not set
1281# CONFIG_PDA_POWER is not set
1282# CONFIG_BATTERY_DS2760 is not set
917# CONFIG_HWMON is not set 1283# CONFIG_HWMON is not set
1284CONFIG_THERMAL=y
1285CONFIG_WATCHDOG=y
1286# CONFIG_WATCHDOG_NOWAYOUT is not set
1287
1288#
1289# Watchdog Device Drivers
1290#
1291# CONFIG_SOFT_WATCHDOG is not set
1292# CONFIG_ACQUIRE_WDT is not set
1293# CONFIG_ADVANTECH_WDT is not set
1294# CONFIG_ALIM1535_WDT is not set
1295# CONFIG_ALIM7101_WDT is not set
1296# CONFIG_SC520_WDT is not set
1297# CONFIG_EUROTECH_WDT is not set
1298# CONFIG_IB700_WDT is not set
1299# CONFIG_IBMASR is not set
1300# CONFIG_WAFER_WDT is not set
1301# CONFIG_I6300ESB_WDT is not set
1302# CONFIG_ITCO_WDT is not set
1303# CONFIG_IT8712F_WDT is not set
1304# CONFIG_HP_WATCHDOG is not set
1305# CONFIG_SC1200_WDT is not set
1306# CONFIG_PC87413_WDT is not set
1307# CONFIG_60XX_WDT is not set
1308# CONFIG_SBC8360_WDT is not set
1309# CONFIG_CPU5_WDT is not set
1310# CONFIG_SMSC37B787_WDT is not set
1311# CONFIG_W83627HF_WDT is not set
1312# CONFIG_W83697HF_WDT is not set
1313# CONFIG_W83877F_WDT is not set
1314# CONFIG_W83977F_WDT is not set
1315# CONFIG_MACHZ_WDT is not set
1316# CONFIG_SBC_EPX_C3_WATCHDOG is not set
1317
1318#
1319# PCI-based Watchdog Cards
1320#
1321# CONFIG_PCIPCWATCHDOG is not set
1322# CONFIG_WDTPCI is not set
1323
1324#
1325# USB-based Watchdog Cards
1326#
1327# CONFIG_USBPCWATCHDOG is not set
1328
1329#
1330# Sonics Silicon Backplane
1331#
1332CONFIG_SSB_POSSIBLE=y
1333# CONFIG_SSB is not set
918 1334
919# 1335#
920# Multifunction device drivers 1336# Multifunction device drivers
921# 1337#
922# CONFIG_MFD_SM501 is not set 1338# CONFIG_MFD_SM501 is not set
1339# CONFIG_HTC_PASIC3 is not set
923 1340
924# 1341#
925# Multimedia devices 1342# Multimedia devices
926# 1343#
1344
1345#
1346# Multimedia core support
1347#
927# CONFIG_VIDEO_DEV is not set 1348# CONFIG_VIDEO_DEV is not set
928# CONFIG_DVB_CORE is not set 1349# CONFIG_DVB_CORE is not set
1350
1351#
1352# Multimedia drivers
1353#
929CONFIG_DAB=y 1354CONFIG_DAB=y
930# CONFIG_USB_DABUSB is not set 1355# CONFIG_USB_DABUSB is not set
931 1356
932# 1357#
933# Graphics support 1358# Graphics support
934# 1359#
935# CONFIG_BACKLIGHT_LCD_SUPPORT is not set 1360CONFIG_AGP=y
1361CONFIG_AGP_AMD64=y
1362CONFIG_AGP_INTEL=y
1363# CONFIG_AGP_SIS is not set
1364# CONFIG_AGP_VIA is not set
1365CONFIG_DRM=y
1366# CONFIG_DRM_TDFX is not set
1367# CONFIG_DRM_R128 is not set
1368# CONFIG_DRM_RADEON is not set
1369# CONFIG_DRM_I810 is not set
1370# CONFIG_DRM_I830 is not set
1371CONFIG_DRM_I915=y
1372# CONFIG_DRM_MGA is not set
1373# CONFIG_DRM_SIS is not set
1374# CONFIG_DRM_VIA is not set
1375# CONFIG_DRM_SAVAGE is not set
1376# CONFIG_VGASTATE is not set
1377# CONFIG_VIDEO_OUTPUT_CONTROL is not set
1378CONFIG_FB=y
1379# CONFIG_FIRMWARE_EDID is not set
1380# CONFIG_FB_DDC is not set
1381CONFIG_FB_CFB_FILLRECT=y
1382CONFIG_FB_CFB_COPYAREA=y
1383CONFIG_FB_CFB_IMAGEBLIT=y
1384# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
1385# CONFIG_FB_SYS_FILLRECT is not set
1386# CONFIG_FB_SYS_COPYAREA is not set
1387# CONFIG_FB_SYS_IMAGEBLIT is not set
1388# CONFIG_FB_FOREIGN_ENDIAN is not set
1389# CONFIG_FB_SYS_FOPS is not set
1390CONFIG_FB_DEFERRED_IO=y
1391# CONFIG_FB_SVGALIB is not set
1392# CONFIG_FB_MACMODES is not set
1393# CONFIG_FB_BACKLIGHT is not set
1394CONFIG_FB_MODE_HELPERS=y
1395CONFIG_FB_TILEBLITTING=y
1396
1397#
1398# Frame buffer hardware drivers
1399#
1400# CONFIG_FB_CIRRUS is not set
1401# CONFIG_FB_PM2 is not set
1402# CONFIG_FB_CYBER2000 is not set
1403# CONFIG_FB_ARC is not set
1404# CONFIG_FB_ASILIANT is not set
1405# CONFIG_FB_IMSTT is not set
1406# CONFIG_FB_VGA16 is not set
1407# CONFIG_FB_UVESA is not set
1408# CONFIG_FB_VESA is not set
1409CONFIG_FB_EFI=y
1410# CONFIG_FB_IMAC is not set
1411# CONFIG_FB_N411 is not set
1412# CONFIG_FB_HGA is not set
1413# CONFIG_FB_S1D13XXX is not set
1414# CONFIG_FB_NVIDIA is not set
1415# CONFIG_FB_RIVA is not set
1416# CONFIG_FB_LE80578 is not set
1417# CONFIG_FB_INTEL is not set
1418# CONFIG_FB_MATROX is not set
1419# CONFIG_FB_RADEON is not set
1420# CONFIG_FB_ATY128 is not set
1421# CONFIG_FB_ATY is not set
1422# CONFIG_FB_S3 is not set
1423# CONFIG_FB_SAVAGE is not set
1424# CONFIG_FB_SIS is not set
1425# CONFIG_FB_NEOMAGIC is not set
1426# CONFIG_FB_KYRO is not set
1427# CONFIG_FB_3DFX is not set
1428# CONFIG_FB_VOODOO1 is not set
1429# CONFIG_FB_VT8623 is not set
1430# CONFIG_FB_TRIDENT is not set
1431# CONFIG_FB_ARK is not set
1432# CONFIG_FB_PM3 is not set
1433# CONFIG_FB_GEODE is not set
1434# CONFIG_FB_VIRTUAL is not set
1435CONFIG_BACKLIGHT_LCD_SUPPORT=y
1436# CONFIG_LCD_CLASS_DEVICE is not set
1437CONFIG_BACKLIGHT_CLASS_DEVICE=y
1438# CONFIG_BACKLIGHT_CORGI is not set
1439# CONFIG_BACKLIGHT_PROGEAR is not set
936 1440
937# 1441#
938# Display device support 1442# Display device support
939# 1443#
940# CONFIG_DISPLAY_SUPPORT is not set 1444# CONFIG_DISPLAY_SUPPORT is not set
941# CONFIG_VGASTATE is not set
942# CONFIG_FB is not set
943 1445
944# 1446#
945# Console display driver support 1447# Console display driver support
946# 1448#
947CONFIG_VGA_CONSOLE=y 1449CONFIG_VGA_CONSOLE=y
948CONFIG_VGACON_SOFT_SCROLLBACK=y 1450CONFIG_VGACON_SOFT_SCROLLBACK=y
949CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 1451CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
950CONFIG_VIDEO_SELECT=y 1452CONFIG_VIDEO_SELECT=y
951CONFIG_DUMMY_CONSOLE=y 1453CONFIG_DUMMY_CONSOLE=y
1454# CONFIG_FRAMEBUFFER_CONSOLE is not set
1455CONFIG_LOGO=y
1456# CONFIG_LOGO_LINUX_MONO is not set
1457# CONFIG_LOGO_LINUX_VGA16 is not set
1458CONFIG_LOGO_LINUX_CLUT224=y
952 1459
953# 1460#
954# Sound 1461# Sound
@@ -958,33 +1465,165 @@ CONFIG_SOUND=y
958# 1465#
959# Advanced Linux Sound Architecture 1466# Advanced Linux Sound Architecture
960# 1467#
961# CONFIG_SND is not set 1468CONFIG_SND=y
1469CONFIG_SND_TIMER=y
1470CONFIG_SND_PCM=y
1471CONFIG_SND_HWDEP=y
1472CONFIG_SND_SEQUENCER=y
1473CONFIG_SND_SEQ_DUMMY=y
1474CONFIG_SND_OSSEMUL=y
1475CONFIG_SND_MIXER_OSS=y
1476CONFIG_SND_PCM_OSS=y
1477CONFIG_SND_PCM_OSS_PLUGINS=y
1478CONFIG_SND_SEQUENCER_OSS=y
1479CONFIG_SND_DYNAMIC_MINORS=y
1480CONFIG_SND_SUPPORT_OLD_API=y
1481CONFIG_SND_VERBOSE_PROCFS=y
1482# CONFIG_SND_VERBOSE_PRINTK is not set
1483# CONFIG_SND_DEBUG is not set
1484CONFIG_SND_VMASTER=y
1485
1486#
1487# Generic devices
1488#
1489# CONFIG_SND_PCSP is not set
1490# CONFIG_SND_DUMMY is not set
1491# CONFIG_SND_VIRMIDI is not set
1492# CONFIG_SND_MTPAV is not set
1493# CONFIG_SND_SERIAL_U16550 is not set
1494# CONFIG_SND_MPU401 is not set
1495
1496#
1497# PCI devices
1498#
1499# CONFIG_SND_AD1889 is not set
1500# CONFIG_SND_ALS300 is not set
1501# CONFIG_SND_ALS4000 is not set
1502# CONFIG_SND_ALI5451 is not set
1503# CONFIG_SND_ATIIXP is not set
1504# CONFIG_SND_ATIIXP_MODEM is not set
1505# CONFIG_SND_AU8810 is not set
1506# CONFIG_SND_AU8820 is not set
1507# CONFIG_SND_AU8830 is not set
1508# CONFIG_SND_AW2 is not set
1509# CONFIG_SND_AZT3328 is not set
1510# CONFIG_SND_BT87X is not set
1511# CONFIG_SND_CA0106 is not set
1512# CONFIG_SND_CMIPCI is not set
1513# CONFIG_SND_OXYGEN is not set
1514# CONFIG_SND_CS4281 is not set
1515# CONFIG_SND_CS46XX is not set
1516# CONFIG_SND_CS5530 is not set
1517# CONFIG_SND_DARLA20 is not set
1518# CONFIG_SND_GINA20 is not set
1519# CONFIG_SND_LAYLA20 is not set
1520# CONFIG_SND_DARLA24 is not set
1521# CONFIG_SND_GINA24 is not set
1522# CONFIG_SND_LAYLA24 is not set
1523# CONFIG_SND_MONA is not set
1524# CONFIG_SND_MIA is not set
1525# CONFIG_SND_ECHO3G is not set
1526# CONFIG_SND_INDIGO is not set
1527# CONFIG_SND_INDIGOIO is not set
1528# CONFIG_SND_INDIGODJ is not set
1529# CONFIG_SND_EMU10K1 is not set
1530# CONFIG_SND_EMU10K1X is not set
1531# CONFIG_SND_ENS1370 is not set
1532# CONFIG_SND_ENS1371 is not set
1533# CONFIG_SND_ES1938 is not set
1534# CONFIG_SND_ES1968 is not set
1535# CONFIG_SND_FM801 is not set
1536CONFIG_SND_HDA_INTEL=y
1537CONFIG_SND_HDA_HWDEP=y
1538CONFIG_SND_HDA_CODEC_REALTEK=y
1539CONFIG_SND_HDA_CODEC_ANALOG=y
1540CONFIG_SND_HDA_CODEC_SIGMATEL=y
1541CONFIG_SND_HDA_CODEC_VIA=y
1542CONFIG_SND_HDA_CODEC_ATIHDMI=y
1543CONFIG_SND_HDA_CODEC_CONEXANT=y
1544CONFIG_SND_HDA_CODEC_CMEDIA=y
1545CONFIG_SND_HDA_CODEC_SI3054=y
1546CONFIG_SND_HDA_GENERIC=y
1547# CONFIG_SND_HDA_POWER_SAVE is not set
1548# CONFIG_SND_HDSP is not set
1549# CONFIG_SND_HDSPM is not set
1550# CONFIG_SND_HIFIER is not set
1551# CONFIG_SND_ICE1712 is not set
1552# CONFIG_SND_ICE1724 is not set
1553# CONFIG_SND_INTEL8X0 is not set
1554# CONFIG_SND_INTEL8X0M is not set
1555# CONFIG_SND_KORG1212 is not set
1556# CONFIG_SND_MAESTRO3 is not set
1557# CONFIG_SND_MIXART is not set
1558# CONFIG_SND_NM256 is not set
1559# CONFIG_SND_PCXHR is not set
1560# CONFIG_SND_RIPTIDE is not set
1561# CONFIG_SND_RME32 is not set
1562# CONFIG_SND_RME96 is not set
1563# CONFIG_SND_RME9652 is not set
1564# CONFIG_SND_SONICVIBES is not set
1565# CONFIG_SND_TRIDENT is not set
1566# CONFIG_SND_VIA82XX is not set
1567# CONFIG_SND_VIA82XX_MODEM is not set
1568# CONFIG_SND_VIRTUOSO is not set
1569# CONFIG_SND_VX222 is not set
1570# CONFIG_SND_YMFPCI is not set
1571
1572#
1573# USB devices
1574#
1575# CONFIG_SND_USB_AUDIO is not set
1576# CONFIG_SND_USB_USX2Y is not set
1577# CONFIG_SND_USB_CAIAQ is not set
1578
1579#
1580# PCMCIA devices
1581#
1582# CONFIG_SND_VXPOCKET is not set
1583# CONFIG_SND_PDAUDIOCF is not set
1584
1585#
1586# System on Chip audio support
1587#
1588# CONFIG_SND_SOC is not set
1589
1590#
1591# ALSA SoC audio for Freescale SOCs
1592#
1593
1594#
1595# SoC Audio for the Texas Instruments OMAP
1596#
962 1597
963# 1598#
964# Open Sound System 1599# Open Sound System
965# 1600#
966CONFIG_SOUND_PRIME=y 1601# CONFIG_SOUND_PRIME is not set
967# CONFIG_SOUND_TRIDENT is not set
968# CONFIG_SOUND_MSNDCLAS is not set
969# CONFIG_SOUND_MSNDPIN is not set
970# CONFIG_SOUND_OSS is not set
971CONFIG_HID_SUPPORT=y 1602CONFIG_HID_SUPPORT=y
972CONFIG_HID=y 1603CONFIG_HID=y
973# CONFIG_HID_DEBUG is not set 1604CONFIG_HID_DEBUG=y
1605CONFIG_HIDRAW=y
974 1606
975# 1607#
976# USB Input Devices 1608# USB Input Devices
977# 1609#
978CONFIG_USB_HID=y 1610CONFIG_USB_HID=y
979# CONFIG_USB_HIDINPUT_POWERBOOK is not set 1611CONFIG_USB_HIDINPUT_POWERBOOK=y
980# CONFIG_HID_FF is not set 1612CONFIG_HID_FF=y
981# CONFIG_USB_HIDDEV is not set 1613CONFIG_HID_PID=y
1614CONFIG_LOGITECH_FF=y
1615# CONFIG_LOGIRUMBLEPAD2_FF is not set
1616CONFIG_PANTHERLORD_FF=y
1617CONFIG_THRUSTMASTER_FF=y
1618CONFIG_ZEROPLUS_FF=y
1619CONFIG_USB_HIDDEV=y
982CONFIG_USB_SUPPORT=y 1620CONFIG_USB_SUPPORT=y
983CONFIG_USB_ARCH_HAS_HCD=y 1621CONFIG_USB_ARCH_HAS_HCD=y
984CONFIG_USB_ARCH_HAS_OHCI=y 1622CONFIG_USB_ARCH_HAS_OHCI=y
985CONFIG_USB_ARCH_HAS_EHCI=y 1623CONFIG_USB_ARCH_HAS_EHCI=y
986CONFIG_USB=y 1624CONFIG_USB=y
987# CONFIG_USB_DEBUG is not set 1625CONFIG_USB_DEBUG=y
1626CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
988 1627
989# 1628#
990# Miscellaneous USB options 1629# Miscellaneous USB options
@@ -992,18 +1631,18 @@ CONFIG_USB=y
992CONFIG_USB_DEVICEFS=y 1631CONFIG_USB_DEVICEFS=y
993# CONFIG_USB_DEVICE_CLASS is not set 1632# CONFIG_USB_DEVICE_CLASS is not set
994# CONFIG_USB_DYNAMIC_MINORS is not set 1633# CONFIG_USB_DYNAMIC_MINORS is not set
995# CONFIG_USB_SUSPEND is not set 1634CONFIG_USB_SUSPEND=y
996# CONFIG_USB_PERSIST is not set
997# CONFIG_USB_OTG is not set 1635# CONFIG_USB_OTG is not set
998 1636
999# 1637#
1000# USB Host Controller Drivers 1638# USB Host Controller Drivers
1001# 1639#
1640# CONFIG_USB_C67X00_HCD is not set
1002CONFIG_USB_EHCI_HCD=y 1641CONFIG_USB_EHCI_HCD=y
1003# CONFIG_USB_EHCI_SPLIT_ISO is not set
1004# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1642# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1005# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1643# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1006# CONFIG_USB_ISP116X_HCD is not set 1644# CONFIG_USB_ISP116X_HCD is not set
1645# CONFIG_USB_ISP1760_HCD is not set
1007CONFIG_USB_OHCI_HCD=y 1646CONFIG_USB_OHCI_HCD=y
1008# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set 1647# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
1009# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set 1648# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
@@ -1036,8 +1675,10 @@ CONFIG_USB_STORAGE=y
1036# CONFIG_USB_STORAGE_SDDR55 is not set 1675# CONFIG_USB_STORAGE_SDDR55 is not set
1037# CONFIG_USB_STORAGE_JUMPSHOT is not set 1676# CONFIG_USB_STORAGE_JUMPSHOT is not set
1038# CONFIG_USB_STORAGE_ALAUDA is not set 1677# CONFIG_USB_STORAGE_ALAUDA is not set
1678# CONFIG_USB_STORAGE_ONETOUCH is not set
1039# CONFIG_USB_STORAGE_KARMA is not set 1679# CONFIG_USB_STORAGE_KARMA is not set
1040# CONFIG_USB_LIBUSUAL is not set 1680# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1681CONFIG_USB_LIBUSUAL=y
1041 1682
1042# 1683#
1043# USB Imaging devices 1684# USB Imaging devices
@@ -1049,10 +1690,6 @@ CONFIG_USB_MON=y
1049# 1690#
1050# USB port drivers 1691# USB port drivers
1051# 1692#
1052
1053#
1054# USB Serial Converter support
1055#
1056# CONFIG_USB_SERIAL is not set 1693# CONFIG_USB_SERIAL is not set
1057 1694
1058# 1695#
@@ -1078,98 +1715,126 @@ CONFIG_USB_MON=y
1078# CONFIG_USB_TRANCEVIBRATOR is not set 1715# CONFIG_USB_TRANCEVIBRATOR is not set
1079# CONFIG_USB_IOWARRIOR is not set 1716# CONFIG_USB_IOWARRIOR is not set
1080# CONFIG_USB_TEST is not set 1717# CONFIG_USB_TEST is not set
1718# CONFIG_USB_GADGET is not set
1719# CONFIG_MMC is not set
1720# CONFIG_MEMSTICK is not set
1721CONFIG_NEW_LEDS=y
1722CONFIG_LEDS_CLASS=y
1081 1723
1082# 1724#
1083# USB DSL modem support 1725# LED drivers
1084# 1726#
1727# CONFIG_LEDS_CLEVO_MAIL is not set
1085 1728
1086# 1729#
1087# USB Gadget Support 1730# LED Triggers
1088# 1731#
1089# CONFIG_USB_GADGET is not set 1732CONFIG_LEDS_TRIGGERS=y
1090# CONFIG_MMC is not set 1733# CONFIG_LEDS_TRIGGER_TIMER is not set
1734# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1735# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1736# CONFIG_ACCESSIBILITY is not set
1737# CONFIG_INFINIBAND is not set
1738CONFIG_EDAC=y
1091 1739
1092# 1740#
1093# LED devices 1741# Reporting subsystems
1094# 1742#
1095# CONFIG_NEW_LEDS is not set 1743# CONFIG_EDAC_DEBUG is not set
1744# CONFIG_EDAC_MM_EDAC is not set
1745CONFIG_RTC_LIB=y
1746CONFIG_RTC_CLASS=y
1747# CONFIG_RTC_HCTOSYS is not set
1748# CONFIG_RTC_DEBUG is not set
1096 1749
1097# 1750#
1098# LED drivers 1751# RTC interfaces
1099# 1752#
1753CONFIG_RTC_INTF_SYSFS=y
1754CONFIG_RTC_INTF_PROC=y
1755CONFIG_RTC_INTF_DEV=y
1756# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
1757# CONFIG_RTC_DRV_TEST is not set
1100 1758
1101# 1759#
1102# LED Triggers 1760# I2C RTC drivers
1103# 1761#
1104# CONFIG_INFINIBAND is not set 1762# CONFIG_RTC_DRV_DS1307 is not set
1105# CONFIG_EDAC is not set 1763# CONFIG_RTC_DRV_DS1374 is not set
1764# CONFIG_RTC_DRV_DS1672 is not set
1765# CONFIG_RTC_DRV_MAX6900 is not set
1766# CONFIG_RTC_DRV_RS5C372 is not set
1767# CONFIG_RTC_DRV_ISL1208 is not set
1768# CONFIG_RTC_DRV_X1205 is not set
1769# CONFIG_RTC_DRV_PCF8563 is not set
1770# CONFIG_RTC_DRV_PCF8583 is not set
1771# CONFIG_RTC_DRV_M41T80 is not set
1772# CONFIG_RTC_DRV_S35390A is not set
1106 1773
1107# 1774#
1108# Real Time Clock 1775# SPI RTC drivers
1109# 1776#
1110# CONFIG_RTC_CLASS is not set
1111 1777
1112# 1778#
1113# DMA Engine support 1779# Platform RTC drivers
1114# 1780#
1115# CONFIG_DMA_ENGINE is not set 1781CONFIG_RTC_DRV_CMOS=y
1782# CONFIG_RTC_DRV_DS1511 is not set
1783# CONFIG_RTC_DRV_DS1553 is not set
1784# CONFIG_RTC_DRV_DS1742 is not set
1785# CONFIG_RTC_DRV_STK17TA8 is not set
1786# CONFIG_RTC_DRV_M48T86 is not set
1787# CONFIG_RTC_DRV_M48T59 is not set
1788# CONFIG_RTC_DRV_V3020 is not set
1116 1789
1117# 1790#
1118# DMA Clients 1791# on-CPU RTC drivers
1119# 1792#
1793CONFIG_DMADEVICES=y
1120 1794
1121# 1795#
1122# DMA Devices 1796# DMA Devices
1123# 1797#
1124CONFIG_VIRTUALIZATION=y 1798# CONFIG_INTEL_IOATDMA is not set
1125# CONFIG_KVM is not set
1126
1127#
1128# Userspace I/O
1129#
1130# CONFIG_UIO is not set 1799# CONFIG_UIO is not set
1131 1800
1132# 1801#
1133# Firmware Drivers 1802# Firmware Drivers
1134# 1803#
1135# CONFIG_EDD is not set 1804# CONFIG_EDD is not set
1805CONFIG_EFI_VARS=y
1136# CONFIG_DELL_RBU is not set 1806# CONFIG_DELL_RBU is not set
1137# CONFIG_DCDBAS is not set 1807# CONFIG_DCDBAS is not set
1138CONFIG_DMIID=y 1808CONFIG_DMIID=y
1809# CONFIG_ISCSI_IBFT_FIND is not set
1139 1810
1140# 1811#
1141# File systems 1812# File systems
1142# 1813#
1143CONFIG_EXT2_FS=y 1814# CONFIG_EXT2_FS is not set
1144CONFIG_EXT2_FS_XATTR=y
1145CONFIG_EXT2_FS_POSIX_ACL=y
1146# CONFIG_EXT2_FS_SECURITY is not set
1147# CONFIG_EXT2_FS_XIP is not set
1148CONFIG_EXT3_FS=y 1815CONFIG_EXT3_FS=y
1149CONFIG_EXT3_FS_XATTR=y 1816CONFIG_EXT3_FS_XATTR=y
1150CONFIG_EXT3_FS_POSIX_ACL=y 1817CONFIG_EXT3_FS_POSIX_ACL=y
1151# CONFIG_EXT3_FS_SECURITY is not set 1818CONFIG_EXT3_FS_SECURITY=y
1152# CONFIG_EXT4DEV_FS is not set 1819# CONFIG_EXT4DEV_FS is not set
1153CONFIG_JBD=y 1820CONFIG_JBD=y
1154# CONFIG_JBD_DEBUG is not set 1821# CONFIG_JBD_DEBUG is not set
1155CONFIG_FS_MBCACHE=y 1822CONFIG_FS_MBCACHE=y
1156CONFIG_REISERFS_FS=y 1823# CONFIG_REISERFS_FS is not set
1157# CONFIG_REISERFS_CHECK is not set
1158# CONFIG_REISERFS_PROC_INFO is not set
1159CONFIG_REISERFS_FS_XATTR=y
1160CONFIG_REISERFS_FS_POSIX_ACL=y
1161# CONFIG_REISERFS_FS_SECURITY is not set
1162# CONFIG_JFS_FS is not set 1824# CONFIG_JFS_FS is not set
1163CONFIG_FS_POSIX_ACL=y 1825CONFIG_FS_POSIX_ACL=y
1164# CONFIG_XFS_FS is not set 1826# CONFIG_XFS_FS is not set
1165# CONFIG_GFS2_FS is not set 1827# CONFIG_GFS2_FS is not set
1166# CONFIG_OCFS2_FS is not set 1828# CONFIG_OCFS2_FS is not set
1167# CONFIG_MINIX_FS is not set 1829CONFIG_DNOTIFY=y
1168# CONFIG_ROMFS_FS is not set
1169CONFIG_INOTIFY=y 1830CONFIG_INOTIFY=y
1170CONFIG_INOTIFY_USER=y 1831CONFIG_INOTIFY_USER=y
1171# CONFIG_QUOTA is not set 1832CONFIG_QUOTA=y
1172CONFIG_DNOTIFY=y 1833CONFIG_QUOTA_NETLINK_INTERFACE=y
1834# CONFIG_PRINT_QUOTA_WARNING is not set
1835# CONFIG_QFMT_V1 is not set
1836CONFIG_QFMT_V2=y
1837CONFIG_QUOTACTL=y
1173# CONFIG_AUTOFS_FS is not set 1838# CONFIG_AUTOFS_FS is not set
1174CONFIG_AUTOFS4_FS=y 1839CONFIG_AUTOFS4_FS=y
1175# CONFIG_FUSE_FS is not set 1840# CONFIG_FUSE_FS is not set
@@ -1180,7 +1845,7 @@ CONFIG_GENERIC_ACL=y
1180# 1845#
1181CONFIG_ISO9660_FS=y 1846CONFIG_ISO9660_FS=y
1182CONFIG_JOLIET=y 1847CONFIG_JOLIET=y
1183# CONFIG_ZISOFS is not set 1848CONFIG_ZISOFS=y
1184# CONFIG_UDF_FS is not set 1849# CONFIG_UDF_FS is not set
1185 1850
1186# 1851#
@@ -1198,13 +1863,13 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
1198# 1863#
1199CONFIG_PROC_FS=y 1864CONFIG_PROC_FS=y
1200CONFIG_PROC_KCORE=y 1865CONFIG_PROC_KCORE=y
1866CONFIG_PROC_VMCORE=y
1201CONFIG_PROC_SYSCTL=y 1867CONFIG_PROC_SYSCTL=y
1202CONFIG_SYSFS=y 1868CONFIG_SYSFS=y
1203CONFIG_TMPFS=y 1869CONFIG_TMPFS=y
1204CONFIG_TMPFS_POSIX_ACL=y 1870CONFIG_TMPFS_POSIX_ACL=y
1205CONFIG_HUGETLBFS=y 1871CONFIG_HUGETLBFS=y
1206CONFIG_HUGETLB_PAGE=y 1872CONFIG_HUGETLB_PAGE=y
1207CONFIG_RAMFS=y
1208# CONFIG_CONFIGFS_FS is not set 1873# CONFIG_CONFIGFS_FS is not set
1209 1874
1210# 1875#
@@ -1212,6 +1877,7 @@ CONFIG_RAMFS=y
1212# 1877#
1213# CONFIG_ADFS_FS is not set 1878# CONFIG_ADFS_FS is not set
1214# CONFIG_AFFS_FS is not set 1879# CONFIG_AFFS_FS is not set
1880# CONFIG_ECRYPT_FS is not set
1215# CONFIG_HFS_FS is not set 1881# CONFIG_HFS_FS is not set
1216# CONFIG_HFSPLUS_FS is not set 1882# CONFIG_HFSPLUS_FS is not set
1217# CONFIG_BEFS_FS is not set 1883# CONFIG_BEFS_FS is not set
@@ -1219,33 +1885,15 @@ CONFIG_RAMFS=y
1219# CONFIG_EFS_FS is not set 1885# CONFIG_EFS_FS is not set
1220# CONFIG_CRAMFS is not set 1886# CONFIG_CRAMFS is not set
1221# CONFIG_VXFS_FS is not set 1887# CONFIG_VXFS_FS is not set
1888# CONFIG_MINIX_FS is not set
1222# CONFIG_HPFS_FS is not set 1889# CONFIG_HPFS_FS is not set
1223# CONFIG_QNX4FS_FS is not set 1890# CONFIG_QNX4FS_FS is not set
1891# CONFIG_ROMFS_FS is not set
1224# CONFIG_SYSV_FS is not set 1892# CONFIG_SYSV_FS is not set
1225# CONFIG_UFS_FS is not set 1893# CONFIG_UFS_FS is not set
1226 1894CONFIG_NETWORK_FILESYSTEMS=y
1227# 1895# CONFIG_NFS_FS is not set
1228# Network File Systems 1896# CONFIG_NFSD is not set
1229#
1230CONFIG_NFS_FS=y
1231CONFIG_NFS_V3=y
1232# CONFIG_NFS_V3_ACL is not set
1233# CONFIG_NFS_V4 is not set
1234# CONFIG_NFS_DIRECTIO is not set
1235CONFIG_NFSD=y
1236CONFIG_NFSD_V3=y
1237# CONFIG_NFSD_V3_ACL is not set
1238# CONFIG_NFSD_V4 is not set
1239CONFIG_NFSD_TCP=y
1240CONFIG_ROOT_NFS=y
1241CONFIG_LOCKD=y
1242CONFIG_LOCKD_V4=y
1243CONFIG_EXPORTFS=y
1244CONFIG_NFS_COMMON=y
1245CONFIG_SUNRPC=y
1246# CONFIG_SUNRPC_BIND34 is not set
1247# CONFIG_RPCSEC_GSS_KRB5 is not set
1248# CONFIG_RPCSEC_GSS_SPKM3 is not set
1249# CONFIG_SMB_FS is not set 1897# CONFIG_SMB_FS is not set
1250# CONFIG_CIFS is not set 1898# CONFIG_CIFS is not set
1251# CONFIG_NCP_FS is not set 1899# CONFIG_NCP_FS is not set
@@ -1255,14 +1903,26 @@ CONFIG_SUNRPC=y
1255# 1903#
1256# Partition Types 1904# Partition Types
1257# 1905#
1258# CONFIG_PARTITION_ADVANCED is not set 1906CONFIG_PARTITION_ADVANCED=y
1907# CONFIG_ACORN_PARTITION is not set
1908CONFIG_OSF_PARTITION=y
1909CONFIG_AMIGA_PARTITION=y
1910# CONFIG_ATARI_PARTITION is not set
1911CONFIG_MAC_PARTITION=y
1259CONFIG_MSDOS_PARTITION=y 1912CONFIG_MSDOS_PARTITION=y
1260 1913CONFIG_BSD_DISKLABEL=y
1261# 1914CONFIG_MINIX_SUBPARTITION=y
1262# Native Language Support 1915CONFIG_SOLARIS_X86_PARTITION=y
1263# 1916CONFIG_UNIXWARE_DISKLABEL=y
1917# CONFIG_LDM_PARTITION is not set
1918CONFIG_SGI_PARTITION=y
1919# CONFIG_ULTRIX_PARTITION is not set
1920CONFIG_SUN_PARTITION=y
1921CONFIG_KARMA_PARTITION=y
1922CONFIG_EFI_PARTITION=y
1923# CONFIG_SYSV68_PARTITION is not set
1264CONFIG_NLS=y 1924CONFIG_NLS=y
1265CONFIG_NLS_DEFAULT="iso8859-1" 1925CONFIG_NLS_DEFAULT="utf8"
1266CONFIG_NLS_CODEPAGE_437=y 1926CONFIG_NLS_CODEPAGE_437=y
1267# CONFIG_NLS_CODEPAGE_737 is not set 1927# CONFIG_NLS_CODEPAGE_737 is not set
1268# CONFIG_NLS_CODEPAGE_775 is not set 1928# CONFIG_NLS_CODEPAGE_775 is not set
@@ -1297,40 +1957,33 @@ CONFIG_NLS_ISO8859_1=y
1297# CONFIG_NLS_ISO8859_9 is not set 1957# CONFIG_NLS_ISO8859_9 is not set
1298# CONFIG_NLS_ISO8859_13 is not set 1958# CONFIG_NLS_ISO8859_13 is not set
1299# CONFIG_NLS_ISO8859_14 is not set 1959# CONFIG_NLS_ISO8859_14 is not set
1300CONFIG_NLS_ISO8859_15=y 1960# CONFIG_NLS_ISO8859_15 is not set
1301# CONFIG_NLS_KOI8_R is not set 1961# CONFIG_NLS_KOI8_R is not set
1302# CONFIG_NLS_KOI8_U is not set 1962# CONFIG_NLS_KOI8_U is not set
1303CONFIG_NLS_UTF8=y 1963CONFIG_NLS_UTF8=y
1304
1305#
1306# Distributed Lock Manager
1307#
1308# CONFIG_DLM is not set 1964# CONFIG_DLM is not set
1309 1965
1310# 1966#
1311# Instrumentation Support
1312#
1313CONFIG_PROFILING=y
1314CONFIG_OPROFILE=y
1315CONFIG_KPROBES=y
1316
1317#
1318# Kernel hacking 1967# Kernel hacking
1319# 1968#
1320CONFIG_TRACE_IRQFLAGS_SUPPORT=y 1969CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1321# CONFIG_PRINTK_TIME is not set 1970# CONFIG_PRINTK_TIME is not set
1971# CONFIG_ENABLE_WARN_DEPRECATED is not set
1322# CONFIG_ENABLE_MUST_CHECK is not set 1972# CONFIG_ENABLE_MUST_CHECK is not set
1973CONFIG_FRAME_WARN=2048
1323CONFIG_MAGIC_SYSRQ=y 1974CONFIG_MAGIC_SYSRQ=y
1324CONFIG_UNUSED_SYMBOLS=y 1975# CONFIG_UNUSED_SYMBOLS is not set
1325CONFIG_DEBUG_FS=y 1976CONFIG_DEBUG_FS=y
1326# CONFIG_HEADERS_CHECK is not set 1977# CONFIG_HEADERS_CHECK is not set
1327CONFIG_DEBUG_KERNEL=y 1978CONFIG_DEBUG_KERNEL=y
1328# CONFIG_DEBUG_SHIRQ is not set 1979# CONFIG_DEBUG_SHIRQ is not set
1329CONFIG_DETECT_SOFTLOCKUP=y 1980# CONFIG_DETECT_SOFTLOCKUP is not set
1330# CONFIG_SCHED_DEBUG is not set 1981# CONFIG_SCHED_DEBUG is not set
1331# CONFIG_SCHEDSTATS is not set 1982CONFIG_SCHEDSTATS=y
1332CONFIG_TIMER_STATS=y 1983CONFIG_TIMER_STATS=y
1333# CONFIG_DEBUG_SLAB is not set 1984# CONFIG_DEBUG_OBJECTS is not set
1985# CONFIG_SLUB_DEBUG_ON is not set
1986# CONFIG_SLUB_STATS is not set
1334# CONFIG_DEBUG_RT_MUTEXES is not set 1987# CONFIG_DEBUG_RT_MUTEXES is not set
1335# CONFIG_RT_MUTEX_TESTER is not set 1988# CONFIG_RT_MUTEX_TESTER is not set
1336# CONFIG_DEBUG_SPINLOCK is not set 1989# CONFIG_DEBUG_SPINLOCK is not set
@@ -1344,28 +1997,162 @@ CONFIG_TIMER_STATS=y
1344CONFIG_DEBUG_BUGVERBOSE=y 1997CONFIG_DEBUG_BUGVERBOSE=y
1345# CONFIG_DEBUG_INFO is not set 1998# CONFIG_DEBUG_INFO is not set
1346# CONFIG_DEBUG_VM is not set 1999# CONFIG_DEBUG_VM is not set
2000# CONFIG_DEBUG_WRITECOUNT is not set
1347# CONFIG_DEBUG_LIST is not set 2001# CONFIG_DEBUG_LIST is not set
1348# CONFIG_FRAME_POINTER is not set 2002# CONFIG_DEBUG_SG is not set
1349CONFIG_OPTIMIZE_INLINING=y 2003CONFIG_FRAME_POINTER=y
2004# CONFIG_BOOT_PRINTK_DELAY is not set
1350# CONFIG_RCU_TORTURE_TEST is not set 2005# CONFIG_RCU_TORTURE_TEST is not set
2006# CONFIG_KPROBES_SANITY_TEST is not set
2007# CONFIG_BACKTRACE_SELF_TEST is not set
1351# CONFIG_LKDTM is not set 2008# CONFIG_LKDTM is not set
1352# CONFIG_FAULT_INJECTION is not set 2009# CONFIG_FAULT_INJECTION is not set
1353# CONFIG_DEBUG_RODATA is not set 2010# CONFIG_LATENCYTOP is not set
1354# CONFIG_IOMMU_DEBUG is not set 2011CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y
2015# CONFIG_STRICT_DEVMEM is not set
2016CONFIG_EARLY_PRINTK=y
1355CONFIG_DEBUG_STACKOVERFLOW=y 2017CONFIG_DEBUG_STACKOVERFLOW=y
1356# CONFIG_DEBUG_STACK_USAGE is not set 2018CONFIG_DEBUG_STACK_USAGE=y
2019# CONFIG_DEBUG_PAGEALLOC is not set
2020# CONFIG_DEBUG_PER_CPU_MAPS is not set
2021# CONFIG_X86_PTDUMP is not set
2022CONFIG_DEBUG_RODATA=y
2023# CONFIG_DIRECT_GBPAGES is not set
2024# CONFIG_DEBUG_RODATA_TEST is not set
2025CONFIG_DEBUG_NX_TEST=m
2026CONFIG_X86_MPPARSE=y
2027# CONFIG_IOMMU_DEBUG is not set
2028CONFIG_IO_DELAY_TYPE_0X80=0
2029CONFIG_IO_DELAY_TYPE_0XED=1
2030CONFIG_IO_DELAY_TYPE_UDELAY=2
2031CONFIG_IO_DELAY_TYPE_NONE=3
2032CONFIG_IO_DELAY_0X80=y
2033# CONFIG_IO_DELAY_0XED is not set
2034# CONFIG_IO_DELAY_UDELAY is not set
2035# CONFIG_IO_DELAY_NONE is not set
2036CONFIG_DEFAULT_IO_DELAY_TYPE=0
2037CONFIG_DEBUG_BOOT_PARAMS=y
2038# CONFIG_CPA_DEBUG is not set
1357 2039
1358# 2040#
1359# Security options 2041# Security options
1360# 2042#
1361# CONFIG_KEYS is not set 2043CONFIG_KEYS=y
1362# CONFIG_SECURITY is not set 2044CONFIG_KEYS_DEBUG_PROC_KEYS=y
1363# CONFIG_CRYPTO is not set 2045CONFIG_SECURITY=y
2046CONFIG_SECURITY_NETWORK=y
2047# CONFIG_SECURITY_NETWORK_XFRM is not set
2048CONFIG_SECURITY_CAPABILITIES=y
2049CONFIG_SECURITY_FILE_CAPABILITIES=y
2050# CONFIG_SECURITY_ROOTPLUG is not set
2051CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
2052CONFIG_SECURITY_SELINUX=y
2053CONFIG_SECURITY_SELINUX_BOOTPARAM=y
2054CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
2055CONFIG_SECURITY_SELINUX_DISABLE=y
2056CONFIG_SECURITY_SELINUX_DEVELOP=y
2057CONFIG_SECURITY_SELINUX_AVC_STATS=y
2058CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2059# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2060# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2061# CONFIG_SECURITY_SMACK is not set
2062CONFIG_CRYPTO=y
2063
2064#
2065# Crypto core or helper
2066#
2067CONFIG_CRYPTO_ALGAPI=y
2068CONFIG_CRYPTO_AEAD=y
2069CONFIG_CRYPTO_BLKCIPHER=y
2070CONFIG_CRYPTO_HASH=y
2071CONFIG_CRYPTO_MANAGER=y
2072# CONFIG_CRYPTO_GF128MUL is not set
2073# CONFIG_CRYPTO_NULL is not set
2074# CONFIG_CRYPTO_CRYPTD is not set
2075CONFIG_CRYPTO_AUTHENC=y
2076# CONFIG_CRYPTO_TEST is not set
2077
2078#
2079# Authenticated Encryption with Associated Data
2080#
2081# CONFIG_CRYPTO_CCM is not set
2082# CONFIG_CRYPTO_GCM is not set
2083# CONFIG_CRYPTO_SEQIV is not set
2084
2085#
2086# Block modes
2087#
2088CONFIG_CRYPTO_CBC=y
2089# CONFIG_CRYPTO_CTR is not set
2090# CONFIG_CRYPTO_CTS is not set
2091CONFIG_CRYPTO_ECB=y
2092# CONFIG_CRYPTO_LRW is not set
2093# CONFIG_CRYPTO_PCBC is not set
2094# CONFIG_CRYPTO_XTS is not set
2095
2096#
2097# Hash modes
2098#
2099CONFIG_CRYPTO_HMAC=y
2100# CONFIG_CRYPTO_XCBC is not set
2101
2102#
2103# Digest
2104#
2105# CONFIG_CRYPTO_CRC32C is not set
2106# CONFIG_CRYPTO_MD4 is not set
2107CONFIG_CRYPTO_MD5=y
2108# CONFIG_CRYPTO_MICHAEL_MIC is not set
2109CONFIG_CRYPTO_SHA1=y
2110# CONFIG_CRYPTO_SHA256 is not set
2111# CONFIG_CRYPTO_SHA512 is not set
2112# CONFIG_CRYPTO_TGR192 is not set
2113# CONFIG_CRYPTO_WP512 is not set
2114
2115#
2116# Ciphers
2117#
2118CONFIG_CRYPTO_AES=y
2119# CONFIG_CRYPTO_AES_X86_64 is not set
2120# CONFIG_CRYPTO_ANUBIS is not set
2121CONFIG_CRYPTO_ARC4=y
2122# CONFIG_CRYPTO_BLOWFISH is not set
2123# CONFIG_CRYPTO_CAMELLIA is not set
2124# CONFIG_CRYPTO_CAST5 is not set
2125# CONFIG_CRYPTO_CAST6 is not set
2126CONFIG_CRYPTO_DES=y
2127# CONFIG_CRYPTO_FCRYPT is not set
2128# CONFIG_CRYPTO_KHAZAD is not set
2129# CONFIG_CRYPTO_SALSA20 is not set
2130# CONFIG_CRYPTO_SALSA20_X86_64 is not set
2131# CONFIG_CRYPTO_SEED is not set
2132# CONFIG_CRYPTO_SERPENT is not set
2133# CONFIG_CRYPTO_TEA is not set
2134# CONFIG_CRYPTO_TWOFISH is not set
2135# CONFIG_CRYPTO_TWOFISH_X86_64 is not set
2136
2137#
2138# Compression
2139#
2140# CONFIG_CRYPTO_DEFLATE is not set
2141# CONFIG_CRYPTO_LZO is not set
2142CONFIG_CRYPTO_HW=y
2143# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2144CONFIG_HAVE_KVM=y
2145CONFIG_VIRTUALIZATION=y
2146# CONFIG_KVM is not set
2147# CONFIG_VIRTIO_PCI is not set
2148# CONFIG_VIRTIO_BALLOON is not set
1364 2149
1365# 2150#
1366# Library routines 2151# Library routines
1367# 2152#
1368CONFIG_BITREVERSE=y 2153CONFIG_BITREVERSE=y
2154CONFIG_GENERIC_FIND_FIRST_BIT=y
2155CONFIG_GENERIC_FIND_NEXT_BIT=y
1369# CONFIG_CRC_CCITT is not set 2156# CONFIG_CRC_CCITT is not set
1370# CONFIG_CRC16 is not set 2157# CONFIG_CRC16 is not set
1371# CONFIG_CRC_ITU_T is not set 2158# CONFIG_CRC_ITU_T is not set
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 441 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 443 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 444 return 0;
451} 445}
452 446
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..20af4c79579a 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
@@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 253 regs->ss |= 3;
249 254
250 err |= __get_user(tmpflags, &sc->flags); 255 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 256 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 257 /* disable syscall checks */
253 regs->orig_ax = -1; 258 regs->orig_ax = -1;
254 259
@@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 520 compat_sigset_t *set, struct pt_regs *regs)
516{ 521{
517 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 523 void __user *restorer;
520 int err = 0; 524 int err = 0;
521 525
@@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 543 goto give_sigsegv;
540 544
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 545 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 548 err |= copy_siginfo_to_user32(&frame->info, info);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b5e329da166c..ffc1bb4fed7d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h>
20#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
21#define __AUDIT_ARCH_LE 0x40000000
22
23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call
26#endif
27
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19 29
20 .macro IA32_ARG_FIXUP noebp=0 30 .macro IA32_ARG_FIXUP noebp=0
@@ -37,6 +47,11 @@
37 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
38 .endm 48 .endm
39 49
50 /*
51 * Reload arg registers from stack in case ptrace changed them.
52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup.
54 */
40 .macro LOAD_ARGS32 offset 55 .macro LOAD_ARGS32 offset
41 movl \offset(%rsp),%r11d 56 movl \offset(%rsp),%r11d
42 movl \offset+8(%rsp),%r10d 57 movl \offset+8(%rsp),%r10d
@@ -46,7 +61,6 @@
46 movl \offset+48(%rsp),%edx 61 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 62 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 63 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 64 .endm
51 65
52 .macro CFI_STARTPROC32 simple 66 .macro CFI_STARTPROC32 simple
@@ -61,6 +75,19 @@
61 CFI_UNDEFINED r15 75 CFI_UNDEFINED r15
62 .endm 76 .endm
63 77
78#ifdef CONFIG_PARAVIRT
79ENTRY(native_usergs_sysret32)
80 swapgs
81 sysretl
82ENDPROC(native_usergs_sysret32)
83
84ENTRY(native_irq_enable_sysexit)
85 swapgs
86 sti
87 sysexit
88ENDPROC(native_irq_enable_sysexit)
89#endif
90
64/* 91/*
65 * 32bit SYSENTER instruction entry. 92 * 32bit SYSENTER instruction entry.
66 * 93 *
@@ -85,14 +112,14 @@ ENTRY(ia32_sysenter_target)
85 CFI_SIGNAL_FRAME 112 CFI_SIGNAL_FRAME
86 CFI_DEF_CFA rsp,0 113 CFI_DEF_CFA rsp,0
87 CFI_REGISTER rsp,rbp 114 CFI_REGISTER rsp,rbp
88 swapgs 115 SWAPGS_UNSAFE_STACK
89 movq %gs:pda_kernelstack, %rsp 116 movq %gs:pda_kernelstack, %rsp
90 addq $(PDA_STACKOFFSET),%rsp 117 addq $(PDA_STACKOFFSET),%rsp
91 /* 118 /*
92 * No need to follow this irqs on/off section: the syscall 119 * No need to follow this irqs on/off section: the syscall
93 * disabled irqs, here we enable it straight after entry: 120 * disabled irqs, here we enable it straight after entry:
94 */ 121 */
95 sti 122 ENABLE_INTERRUPTS(CLBR_NONE)
96 movl %ebp,%ebp /* zero extension */ 123 movl %ebp,%ebp /* zero extension */
97 pushq $__USER32_DS 124 pushq $__USER32_DS
98 CFI_ADJUST_CFA_OFFSET 8 125 CFI_ADJUST_CFA_OFFSET 8
@@ -103,7 +130,7 @@ ENTRY(ia32_sysenter_target)
103 pushfq 130 pushfq
104 CFI_ADJUST_CFA_OFFSET 8 131 CFI_ADJUST_CFA_OFFSET 8
105 /*CFI_REL_OFFSET rflags,0*/ 132 /*CFI_REL_OFFSET rflags,0*/
106 movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d 133 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
107 CFI_REGISTER rip,r10 134 CFI_REGISTER rip,r10
108 pushq $__USER32_CS 135 pushq $__USER32_CS
109 CFI_ADJUST_CFA_OFFSET 8 136 CFI_ADJUST_CFA_OFFSET 8
@@ -123,22 +150,24 @@ ENTRY(ia32_sysenter_target)
123 .quad 1b,ia32_badarg 150 .quad 1b,ia32_badarg
124 .previous 151 .previous
125 GET_THREAD_INFO(%r10) 152 GET_THREAD_INFO(%r10)
126 orl $TS_COMPAT,threadinfo_status(%r10) 153 orl $TS_COMPAT,TI_status(%r10)
127 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 154 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
128 CFI_REMEMBER_STATE 155 CFI_REMEMBER_STATE
129 jnz sysenter_tracesys 156 jnz sysenter_tracesys
130sysenter_do_call:
131 cmpl $(IA32_NR_syscalls-1),%eax 157 cmpl $(IA32_NR_syscalls-1),%eax
132 ja ia32_badsys 158 ja ia32_badsys
159sysenter_do_call:
133 IA32_ARG_FIXUP 1 160 IA32_ARG_FIXUP 1
161sysenter_dispatch:
134 call *ia32_sys_call_table(,%rax,8) 162 call *ia32_sys_call_table(,%rax,8)
135 movq %rax,RAX-ARGOFFSET(%rsp) 163 movq %rax,RAX-ARGOFFSET(%rsp)
136 GET_THREAD_INFO(%r10) 164 GET_THREAD_INFO(%r10)
137 cli 165 DISABLE_INTERRUPTS(CLBR_NONE)
138 TRACE_IRQS_OFF 166 TRACE_IRQS_OFF
139 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 167 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
140 jnz int_ret_from_sys_call 168 jnz sysexit_audit
141 andl $~TS_COMPAT,threadinfo_status(%r10) 169sysexit_from_sys_call:
170 andl $~TS_COMPAT,TI_status(%r10)
142 /* clear IF, that popfq doesn't enable interrupts early */ 171 /* clear IF, that popfq doesn't enable interrupts early */
143 andl $~0x200,EFLAGS-R11(%rsp) 172 andl $~0x200,EFLAGS-R11(%rsp)
144 movl RIP-R11(%rsp),%edx /* User %eip */ 173 movl RIP-R11(%rsp),%edx /* User %eip */
@@ -151,14 +180,65 @@ sysenter_do_call:
151 CFI_ADJUST_CFA_OFFSET -8 180 CFI_ADJUST_CFA_OFFSET -8
152 CFI_REGISTER rsp,rcx 181 CFI_REGISTER rsp,rcx
153 TRACE_IRQS_ON 182 TRACE_IRQS_ON
154 swapgs 183 ENABLE_INTERRUPTS_SYSEXIT32
155 sti /* sti only takes effect after the next instruction */
156 /* sysexit */
157 .byte 0xf, 0x35
158 184
159sysenter_tracesys: 185#ifdef CONFIG_AUDITSYSCALL
186 .macro auditsys_entry_common
187 movl %esi,%r9d /* 6th arg: 4th syscall arg */
188 movl %edx,%r8d /* 5th arg: 3rd syscall arg */
189 /* (already in %ecx) 4th arg: 2nd syscall arg */
190 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
191 movl %eax,%esi /* 2nd arg: syscall number */
192 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
193 call audit_syscall_entry
194 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
195 cmpl $(IA32_NR_syscalls-1),%eax
196 ja ia32_badsys
197 movl %ebx,%edi /* reload 1st syscall arg */
198 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
199 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
200 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
201 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
202 .endm
203
204 .macro auditsys_exit exit,ebpsave=RBP
205 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
206 jnz int_ret_from_sys_call
207 TRACE_IRQS_ON
208 sti
209 movl %eax,%esi /* second arg, syscall return value */
210 cmpl $0,%eax /* is it < 0? */
211 setl %al /* 1 if so, 0 if not */
212 movzbl %al,%edi /* zero-extend that into %edi */
213 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
214 call audit_syscall_exit
215 GET_THREAD_INFO(%r10)
216 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
217 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
218 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
219 cli
220 TRACE_IRQS_OFF
221 testl %edi,TI_flags(%r10)
222 jnz int_with_check
223 jmp \exit
224 .endm
225
226sysenter_auditsys:
160 CFI_RESTORE_STATE 227 CFI_RESTORE_STATE
228 auditsys_entry_common
229 movl %ebp,%r9d /* reload 6th syscall arg */
230 jmp sysenter_dispatch
231
232sysexit_audit:
233 auditsys_exit sysexit_from_sys_call
234#endif
235
236sysenter_tracesys:
161 xchgl %r9d,%ebp 237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys
241#endif
162 SAVE_REST 242 SAVE_REST
163 CLEAR_RREGS 243 CLEAR_RREGS
164 movq %r9,R9(%rsp) 244 movq %r9,R9(%rsp)
@@ -200,7 +280,7 @@ ENTRY(ia32_cstar_target)
200 CFI_DEF_CFA rsp,PDA_STACKOFFSET 280 CFI_DEF_CFA rsp,PDA_STACKOFFSET
201 CFI_REGISTER rip,rcx 281 CFI_REGISTER rip,rcx
202 /*CFI_REGISTER rflags,r11*/ 282 /*CFI_REGISTER rflags,r11*/
203 swapgs 283 SWAPGS_UNSAFE_STACK
204 movl %esp,%r8d 284 movl %esp,%r8d
205 CFI_REGISTER rsp,r8 285 CFI_REGISTER rsp,r8
206 movq %gs:pda_kernelstack,%rsp 286 movq %gs:pda_kernelstack,%rsp
@@ -208,7 +288,7 @@ ENTRY(ia32_cstar_target)
208 * No need to follow this irqs on/off section: the syscall 288 * No need to follow this irqs on/off section: the syscall
209 * disabled irqs and here we enable it straight after entry: 289 * disabled irqs and here we enable it straight after entry:
210 */ 290 */
211 sti 291 ENABLE_INTERRUPTS(CLBR_NONE)
212 SAVE_ARGS 8,1,1 292 SAVE_ARGS 8,1,1
213 movl %eax,%eax /* zero extension */ 293 movl %eax,%eax /* zero extension */
214 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 294 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -230,22 +310,24 @@ ENTRY(ia32_cstar_target)
230 .quad 1b,ia32_badarg 310 .quad 1b,ia32_badarg
231 .previous 311 .previous
232 GET_THREAD_INFO(%r10) 312 GET_THREAD_INFO(%r10)
233 orl $TS_COMPAT,threadinfo_status(%r10) 313 orl $TS_COMPAT,TI_status(%r10)
234 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
235 CFI_REMEMBER_STATE 315 CFI_REMEMBER_STATE
236 jnz cstar_tracesys 316 jnz cstar_tracesys
237cstar_do_call: 317cstar_do_call:
238 cmpl $IA32_NR_syscalls-1,%eax 318 cmpl $IA32_NR_syscalls-1,%eax
239 ja ia32_badsys 319 ja ia32_badsys
240 IA32_ARG_FIXUP 1 320 IA32_ARG_FIXUP 1
321cstar_dispatch:
241 call *ia32_sys_call_table(,%rax,8) 322 call *ia32_sys_call_table(,%rax,8)
242 movq %rax,RAX-ARGOFFSET(%rsp) 323 movq %rax,RAX-ARGOFFSET(%rsp)
243 GET_THREAD_INFO(%r10) 324 GET_THREAD_INFO(%r10)
244 cli 325 DISABLE_INTERRUPTS(CLBR_NONE)
245 TRACE_IRQS_OFF 326 TRACE_IRQS_OFF
246 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) 327 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
247 jnz int_ret_from_sys_call 328 jnz sysretl_audit
248 andl $~TS_COMPAT,threadinfo_status(%r10) 329sysretl_from_sys_call:
330 andl $~TS_COMPAT,TI_status(%r10)
249 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 331 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
250 movl RIP-ARGOFFSET(%rsp),%ecx 332 movl RIP-ARGOFFSET(%rsp),%ecx
251 CFI_REGISTER rip,rcx 333 CFI_REGISTER rip,rcx
@@ -254,11 +336,25 @@ cstar_do_call:
254 TRACE_IRQS_ON 336 TRACE_IRQS_ON
255 movl RSP-ARGOFFSET(%rsp),%esp 337 movl RSP-ARGOFFSET(%rsp),%esp
256 CFI_RESTORE rsp 338 CFI_RESTORE rsp
257 swapgs 339 USERGS_SYSRET32
258 sysretl
259 340
260cstar_tracesys: 341#ifdef CONFIG_AUDITSYSCALL
342cstar_auditsys:
261 CFI_RESTORE_STATE 343 CFI_RESTORE_STATE
344 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
345 auditsys_entry_common
346 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
347 jmp cstar_dispatch
348
349sysretl_audit:
350 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
351#endif
352
353cstar_tracesys:
354#ifdef CONFIG_AUDITSYSCALL
355 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
356 jz cstar_auditsys
357#endif
262 xchgl %r9d,%ebp 358 xchgl %r9d,%ebp
263 SAVE_REST 359 SAVE_REST
264 CLEAR_RREGS 360 CLEAR_RREGS
@@ -310,12 +406,13 @@ ENTRY(ia32_syscall)
310 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 406 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
311 /*CFI_REL_OFFSET cs,CS-RIP*/ 407 /*CFI_REL_OFFSET cs,CS-RIP*/
312 CFI_REL_OFFSET rip,RIP-RIP 408 CFI_REL_OFFSET rip,RIP-RIP
313 swapgs 409 PARAVIRT_ADJUST_EXCEPTION_FRAME
410 SWAPGS
314 /* 411 /*
315 * No need to follow this irqs on/off section: the syscall 412 * No need to follow this irqs on/off section: the syscall
316 * disabled irqs and here we enable it straight after entry: 413 * disabled irqs and here we enable it straight after entry:
317 */ 414 */
318 sti 415 ENABLE_INTERRUPTS(CLBR_NONE)
319 movl %eax,%eax 416 movl %eax,%eax
320 pushq %rax 417 pushq %rax
321 CFI_ADJUST_CFA_OFFSET 8 418 CFI_ADJUST_CFA_OFFSET 8
@@ -324,8 +421,8 @@ ENTRY(ia32_syscall)
324 this could be a problem. */ 421 this could be a problem. */
325 SAVE_ARGS 0,0,1 422 SAVE_ARGS 0,0,1
326 GET_THREAD_INFO(%r10) 423 GET_THREAD_INFO(%r10)
327 orl $TS_COMPAT,threadinfo_status(%r10) 424 orl $TS_COMPAT,TI_status(%r10)
328 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) 425 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
329 jnz ia32_tracesys 426 jnz ia32_tracesys
330ia32_do_syscall: 427ia32_do_syscall:
331 cmpl $(IA32_NR_syscalls-1),%eax 428 cmpl $(IA32_NR_syscalls-1),%eax
@@ -370,13 +467,11 @@ quiet_ni_syscall:
370 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi 467 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
371 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi 468 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
372 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx 469 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
373 PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
374 PTREGSCALL stub32_execve, sys32_execve, %rcx 470 PTREGSCALL stub32_execve, sys32_execve, %rcx
375 PTREGSCALL stub32_fork, sys_fork, %rdi 471 PTREGSCALL stub32_fork, sys_fork, %rdi
376 PTREGSCALL stub32_clone, sys32_clone, %rdx 472 PTREGSCALL stub32_clone, sys32_clone, %rdx
377 PTREGSCALL stub32_vfork, sys_vfork, %rdi 473 PTREGSCALL stub32_vfork, sys_vfork, %rdi
378 PTREGSCALL stub32_iopl, sys_iopl, %rsi 474 PTREGSCALL stub32_iopl, sys_iopl, %rsi
379 PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
380 475
381ENTRY(ia32_ptregs_common) 476ENTRY(ia32_ptregs_common)
382 popq %r11 477 popq %r11
@@ -476,7 +571,7 @@ ia32_sys_call_table:
476 .quad sys_ssetmask 571 .quad sys_ssetmask
477 .quad sys_setreuid16 /* 70 */ 572 .quad sys_setreuid16 /* 70 */
478 .quad sys_setregid16 573 .quad sys_setregid16
479 .quad stub32_sigsuspend 574 .quad sys32_sigsuspend
480 .quad compat_sys_sigpending 575 .quad compat_sys_sigpending
481 .quad sys_sethostname 576 .quad sys_sethostname
482 .quad compat_sys_setrlimit /* 75 */ 577 .quad compat_sys_setrlimit /* 75 */
@@ -583,7 +678,7 @@ ia32_sys_call_table:
583 .quad sys32_rt_sigpending 678 .quad sys32_rt_sigpending
584 .quad compat_sys_rt_sigtimedwait 679 .quad compat_sys_rt_sigtimedwait
585 .quad sys32_rt_sigqueueinfo 680 .quad sys32_rt_sigqueueinfo
586 .quad stub32_rt_sigsuspend 681 .quad sys_rt_sigsuspend
587 .quad sys32_pread /* 180 */ 682 .quad sys32_pread /* 180 */
588 .quad sys32_pwrite 683 .quad sys32_pwrite
589 .quad sys_chown16 684 .quad sys_chown16
@@ -731,4 +826,10 @@ ia32_sys_call_table:
731 .quad sys32_fallocate 826 .quad sys32_fallocate
732 .quad compat_sys_timerfd_settime /* 325 */ 827 .quad compat_sys_timerfd_settime /* 325 */
733 .quad compat_sys_timerfd_gettime 828 .quad compat_sys_timerfd_gettime
829 .quad compat_sys_signalfd4
830 .quad sys_eventfd2
831 .quad sys_epoll_create1
832 .quad sys_dup3 /* 330 */
833 .quad sys_pipe2
834 .quad sys_inotify_init1
734ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..d3c64088b981 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd)
238 int retval; 238 int retval;
239 int fds[2]; 239 int fds[2];
240 240
241 retval = do_pipe(fds); 241 retval = do_pipe_flags(fds, 0);
242 if (retval) 242 if (retval)
243 goto out; 243 goto out;
244 if (copy_to_user(fd, fds, sizeof(fds))) 244 if (copy_to_user(fd, fds, sizeof(fds)))
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 77807d4769c9..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,10 +2,17 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds 5extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE
10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg
14endif
15
9# 16#
10# vsyscalls (which work on the user stack) should have 17# vsyscalls (which work on the user stack) should have
11# no stack-protector checks: 18# no stack-protector checks:
@@ -13,20 +20,21 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
13nostackp := $(call cc-option, -fno-stack-protector) 20nostackp := $(call cc-option, -fno-stack-protector)
14CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 21CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
15CFLAGS_hpet.o := $(nostackp) 22CFLAGS_hpet.o := $(nostackp)
16CFLAGS_tsc_64.o := $(nostackp) 23CFLAGS_tsc.o := $(nostackp)
17 24
18obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
19obj-y += traps_$(BITS).o irq_$(BITS).o 26obj-y += traps_$(BITS).o irq_$(BITS).o
20obj-y += time_$(BITS).o ioport.o ldt.o 27obj-y += time_$(BITS).o ioport.o ldt.o
21obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o 28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
29obj-$(CONFIG_X86_VISWS) += visws_quirks.o
30obj-$(CONFIG_X86_32) += probe_roms_32.o
22obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 31obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
23obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 32obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
24obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o 33obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
25obj-y += bootflag.o e820_$(BITS).o 34obj-y += bootflag.o e820.o
26obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 35obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
27obj-y += alternative.o i8253.o pci-nommu.o 36obj-y += alternative.o i8253.o pci-nommu.o
28obj-$(CONFIG_X86_64) += bugs_64.o 37obj-y += tsc.o io_delay.o rtc.o
29obj-y += tsc_$(BITS).o io_delay.o rtc.o
30 38
31obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
32obj-y += process.o 40obj-y += process.o
@@ -53,9 +61,10 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
53obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 61obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
54obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 62obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
55obj-$(CONFIG_X86_MPPARSE) += mpparse.o 63obj-$(CONFIG_X86_MPPARSE) += mpparse.o
56obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o 64obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
57obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 65obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
58obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
59obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
60obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
61obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
@@ -64,7 +73,6 @@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
64obj-y += vsmp_64.o 73obj-y += vsmp_64.o
65obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
66obj-$(CONFIG_MODULES) += module_$(BITS).o 75obj-$(CONFIG_MODULES) += module_$(BITS).o
67obj-$(CONFIG_ACPI_SRAT) += srat_32.o
68obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 76obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
69obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 77obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
70obj-$(CONFIG_KGDB) += kgdb.o 78obj-$(CONFIG_KGDB) += kgdb.o
@@ -94,12 +102,14 @@ obj-$(CONFIG_OLPC) += olpc.o
94### 102###
95# 64 bit specific files 103# 64 bit specific files
96ifeq ($(CONFIG_X86_64),y) 104ifeq ($(CONFIG_X86_64),y)
97 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o 105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o
98 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
99 obj-$(CONFIG_AUDIT) += audit_64.o 108 obj-$(CONFIG_AUDIT) += audit_64.o
100 109
101 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 110 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
102 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 111 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
112 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
103 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o 113 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
104 114
105 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 115 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..fa88a1d71290 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/io_apic.h> 38#include <asm/io_apic.h>
39#include <asm/apic.h> 39#include <asm/apic.h>
40#include <asm/genapic.h>
40#include <asm/io.h> 41#include <asm/io.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/smp.h> 43#include <asm/smp.h>
@@ -106,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
106 */ 107 */
107enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; 108enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
108 109
109#ifdef CONFIG_X86_64
110
111/* rely on all ACPI tables being in the direct mapping */
112char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
113{
114 if (!phys_addr || !size)
115 return NULL;
116
117 if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
118 return __va(phys_addr);
119
120 return NULL;
121}
122
123#else
124 110
125/* 111/*
126 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, 112 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -139,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
139 unsigned long base, offset, mapped_size; 125 unsigned long base, offset, mapped_size;
140 int idx; 126 int idx;
141 127
142 if (phys + size < 8 * 1024 * 1024) 128 if (!phys || !size)
129 return NULL;
130
131 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
143 return __va(phys); 132 return __va(phys);
144 133
145 offset = phys & (PAGE_SIZE - 1); 134 offset = phys & (PAGE_SIZE - 1);
146 mapped_size = PAGE_SIZE - offset; 135 mapped_size = PAGE_SIZE - offset;
136 clear_fixmap(FIX_ACPI_END);
147 set_fixmap(FIX_ACPI_END, phys); 137 set_fixmap(FIX_ACPI_END, phys);
148 base = fix_to_virt(FIX_ACPI_END); 138 base = fix_to_virt(FIX_ACPI_END);
149 139
@@ -155,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
155 if (--idx < FIX_ACPI_BEGIN) 145 if (--idx < FIX_ACPI_BEGIN)
156 return NULL; /* cannot handle this */ 146 return NULL; /* cannot handle this */
157 phys += PAGE_SIZE; 147 phys += PAGE_SIZE;
148 clear_fixmap(idx);
158 set_fixmap(idx, phys); 149 set_fixmap(idx, phys);
159 mapped_size += PAGE_SIZE; 150 mapped_size += PAGE_SIZE;
160 } 151 }
161 152
162 return ((unsigned char *)base + offset); 153 return ((unsigned char *)base + offset);
163} 154}
164#endif
165 155
166#ifdef CONFIG_PCI_MMCONFIG 156#ifdef CONFIG_PCI_MMCONFIG
167/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 157/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
@@ -338,8 +328,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
338 328
339#ifdef CONFIG_X86_IO_APIC 329#ifdef CONFIG_X86_IO_APIC
340 330
341struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
342
343static int __init 331static int __init
344acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) 332acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
345{ 333{
@@ -514,8 +502,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
514 * Make sure all (legacy) PCI IRQs are set as level-triggered. 502 * Make sure all (legacy) PCI IRQs are set as level-triggered.
515 */ 503 */
516 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 504 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
517 extern void eisa_set_level_irq(unsigned int irq);
518
519 if (triggering == ACPI_LEVEL_SENSITIVE) 505 if (triggering == ACPI_LEVEL_SENSITIVE)
520 eisa_set_level_irq(gsi); 506 eisa_set_level_irq(gsi);
521 } 507 }
@@ -860,6 +846,364 @@ static int __init acpi_parse_madt_lapic_entries(void)
860#endif /* CONFIG_X86_LOCAL_APIC */ 846#endif /* CONFIG_X86_LOCAL_APIC */
861 847
862#ifdef CONFIG_X86_IO_APIC 848#ifdef CONFIG_X86_IO_APIC
849#define MP_ISA_BUS 0
850
851#ifdef CONFIG_X86_ES7000
852extern int es7000_plat;
853#endif
854
855static struct {
856 int apic_id;
857 int gsi_base;
858 int gsi_end;
859 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
860} mp_ioapic_routing[MAX_IO_APICS];
861
862static int mp_find_ioapic(int gsi)
863{
864 int i = 0;
865
866 /* Find the IOAPIC that manages this GSI. */
867 for (i = 0; i < nr_ioapics; i++) {
868 if ((gsi >= mp_ioapic_routing[i].gsi_base)
869 && (gsi <= mp_ioapic_routing[i].gsi_end))
870 return i;
871 }
872
873 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
874 return -1;
875}
876
877static u8 __init uniq_ioapic_id(u8 id)
878{
879#ifdef CONFIG_X86_32
880 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
881 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
882 return io_apic_get_unique_id(nr_ioapics, id);
883 else
884 return id;
885#else
886 int i;
887 DECLARE_BITMAP(used, 256);
888 bitmap_zero(used, 256);
889 for (i = 0; i < nr_ioapics; i++) {
890 struct mp_config_ioapic *ia = &mp_ioapics[i];
891 __set_bit(ia->mp_apicid, used);
892 }
893 if (!test_bit(id, used))
894 return id;
895 return find_first_zero_bit(used, 256);
896#endif
897}
898
899static int bad_ioapic(unsigned long address)
900{
901 if (nr_ioapics >= MAX_IO_APICS) {
902 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
903 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
904 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
905 }
906 if (!address) {
907 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
908 " found in table, skipping!\n");
909 return 1;
910 }
911 return 0;
912}
913
914void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
915{
916 int idx = 0;
917
918 if (bad_ioapic(address))
919 return;
920
921 idx = nr_ioapics;
922
923 mp_ioapics[idx].mp_type = MP_IOAPIC;
924 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
925 mp_ioapics[idx].mp_apicaddr = address;
926
927 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
928 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
929#ifdef CONFIG_X86_32
930 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
931#else
932 mp_ioapics[idx].mp_apicver = 0;
933#endif
934 /*
935 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
936 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
937 */
938 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
939 mp_ioapic_routing[idx].gsi_base = gsi_base;
940 mp_ioapic_routing[idx].gsi_end = gsi_base +
941 io_apic_get_redir_entries(idx);
942
943 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
944 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
945 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
946 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
947
948 nr_ioapics++;
949}
950
951static void assign_to_mp_irq(struct mp_config_intsrc *m,
952 struct mp_config_intsrc *mp_irq)
953{
954 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
955}
956
957static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
958 struct mp_config_intsrc *m)
959{
960 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
961}
962
963static void save_mp_irq(struct mp_config_intsrc *m)
964{
965 int i;
966
967 for (i = 0; i < mp_irq_entries; i++) {
968 if (!mp_irq_cmp(&mp_irqs[i], m))
969 return;
970 }
971
972 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
973 if (++mp_irq_entries == MAX_IRQ_SOURCES)
974 panic("Max # of irq sources exceeded!!\n");
975}
976
977void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
978{
979 int ioapic;
980 int pin;
981 struct mp_config_intsrc mp_irq;
982
983 /*
984 * Convert 'gsi' to 'ioapic.pin'.
985 */
986 ioapic = mp_find_ioapic(gsi);
987 if (ioapic < 0)
988 return;
989 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
990
991 /*
992 * TBD: This check is for faulty timer entries, where the override
993 * erroneously sets the trigger to level, resulting in a HUGE
994 * increase of timer interrupts!
995 */
996 if ((bus_irq == 0) && (trigger == 3))
997 trigger = 1;
998
999 mp_irq.mp_type = MP_INTSRC;
1000 mp_irq.mp_irqtype = mp_INT;
1001 mp_irq.mp_irqflag = (trigger << 2) | polarity;
1002 mp_irq.mp_srcbus = MP_ISA_BUS;
1003 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
1004 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
1005 mp_irq.mp_dstirq = pin; /* INTIN# */
1006
1007 save_mp_irq(&mp_irq);
1008}
1009
1010void __init mp_config_acpi_legacy_irqs(void)
1011{
1012 int i;
1013 int ioapic;
1014 unsigned int dstapic;
1015 struct mp_config_intsrc mp_irq;
1016
1017#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1018 /*
1019 * Fabricate the legacy ISA bus (bus #31).
1020 */
1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
1022#endif
1023 set_bit(MP_ISA_BUS, mp_bus_not_pci);
1024 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
1025
1026#ifdef CONFIG_X86_ES7000
1027 /*
1028 * Older generations of ES7000 have no legacy identity mappings
1029 */
1030 if (es7000_plat == 1)
1031 return;
1032#endif
1033
1034 /*
1035 * Locate the IOAPIC that manages the ISA IRQs (0-15).
1036 */
1037 ioapic = mp_find_ioapic(0);
1038 if (ioapic < 0)
1039 return;
1040 dstapic = mp_ioapics[ioapic].mp_apicid;
1041
1042 /*
1043 * Use the default configuration for the IRQs 0-15. Unless
1044 * overridden by (MADT) interrupt source override entries.
1045 */
1046 for (i = 0; i < 16; i++) {
1047 int idx;
1048
1049 for (idx = 0; idx < mp_irq_entries; idx++) {
1050 struct mp_config_intsrc *irq = mp_irqs + idx;
1051
1052 /* Do we already have a mapping for this ISA IRQ? */
1053 if (irq->mp_srcbus == MP_ISA_BUS
1054 && irq->mp_srcbusirq == i)
1055 break;
1056
1057 /* Do we already have a mapping for this IOAPIC pin */
1058 if (irq->mp_dstapic == dstapic &&
1059 irq->mp_dstirq == i)
1060 break;
1061 }
1062
1063 if (idx != mp_irq_entries) {
1064 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1065 continue; /* IRQ already used */
1066 }
1067
1068 mp_irq.mp_type = MP_INTSRC;
1069 mp_irq.mp_irqflag = 0; /* Conforming */
1070 mp_irq.mp_srcbus = MP_ISA_BUS;
1071 mp_irq.mp_dstapic = dstapic;
1072 mp_irq.mp_irqtype = mp_INT;
1073 mp_irq.mp_srcbusirq = i; /* Identity mapped */
1074 mp_irq.mp_dstirq = i;
1075
1076 save_mp_irq(&mp_irq);
1077 }
1078}
1079
1080int mp_register_gsi(u32 gsi, int triggering, int polarity)
1081{
1082 int ioapic;
1083 int ioapic_pin;
1084#ifdef CONFIG_X86_32
1085#define MAX_GSI_NUM 4096
1086#define IRQ_COMPRESSION_START 64
1087
1088 static int pci_irq = IRQ_COMPRESSION_START;
1089 /*
1090 * Mapping between Global System Interrupts, which
1091 * represent all possible interrupts, and IRQs
1092 * assigned to actual devices.
1093 */
1094 static int gsi_to_irq[MAX_GSI_NUM];
1095#else
1096
1097 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1098 return gsi;
1099#endif
1100
1101 /* Don't set up the ACPI SCI because it's already set up */
1102 if (acpi_gbl_FADT.sci_interrupt == gsi)
1103 return gsi;
1104
1105 ioapic = mp_find_ioapic(gsi);
1106 if (ioapic < 0) {
1107 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1108 return gsi;
1109 }
1110
1111 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1112
1113#ifdef CONFIG_X86_32
1114 if (ioapic_renumber_irq)
1115 gsi = ioapic_renumber_irq(ioapic, gsi);
1116#endif
1117
1118 /*
1119 * Avoid pin reprogramming. PRTs typically include entries
1120 * with redundant pin->gsi mappings (but unique PCI devices);
1121 * we only program the IOAPIC on the first.
1122 */
1123 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1124 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1125 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1126 ioapic_pin);
1127 return gsi;
1128 }
1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1130 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1132#ifdef CONFIG_X86_32
1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1134#else
1135 return gsi;
1136#endif
1137 }
1138
1139 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1140#ifdef CONFIG_X86_32
1141 /*
1142 * For GSI >= 64, use IRQ compression
1143 */
1144 if ((gsi >= IRQ_COMPRESSION_START)
1145 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1146 /*
1147 * For PCI devices assign IRQs in order, avoiding gaps
1148 * due to unused I/O APIC pins.
1149 */
1150 int irq = gsi;
1151 if (gsi < MAX_GSI_NUM) {
1152 /*
1153 * Retain the VIA chipset work-around (gsi > 15), but
1154 * avoid a problem where the 8254 timer (IRQ0) is setup
1155 * via an override (so it's not on pin 0 of the ioapic),
1156 * and at the same time, the pin 0 interrupt is a PCI
1157 * type. The gsi > 15 test could cause these two pins
1158 * to be shared as IRQ0, and they are not shareable.
1159 * So test for this condition, and if necessary, avoid
1160 * the pin collision.
1161 */
1162 gsi = pci_irq++;
1163 /*
1164 * Don't assign IRQ used by ACPI SCI
1165 */
1166 if (gsi == acpi_gbl_FADT.sci_interrupt)
1167 gsi = pci_irq++;
1168 gsi_to_irq[irq] = gsi;
1169 } else {
1170 printk(KERN_ERR "GSI %u is too high\n", gsi);
1171 return gsi;
1172 }
1173 }
1174#endif
1175 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1176 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1177 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1178 return gsi;
1179}
1180
1181int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1182 u32 gsi, int triggering, int polarity)
1183{
1184#ifdef CONFIG_X86_MPPARSE
1185 struct mp_config_intsrc mp_irq;
1186 int ioapic;
1187
1188 if (!acpi_ioapic)
1189 return 0;
1190
1191 /* print the entry should happen on mptable identically */
1192 mp_irq.mp_type = MP_INTSRC;
1193 mp_irq.mp_irqtype = mp_INT;
1194 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1195 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1196 mp_irq.mp_srcbus = number;
1197 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1198 ioapic = mp_find_ioapic(gsi);
1199 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
1200 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
1201
1202 save_mp_irq(&mp_irq);
1203#endif
1204 return 0;
1205}
1206
863/* 1207/*
864 * Parse IOAPIC related entries in MADT 1208 * Parse IOAPIC related entries in MADT
865 * returns 0 on success, < 0 on error 1209 * returns 0 on success, < 0 on error
@@ -1009,8 +1353,6 @@ static void __init acpi_process_madt(void)
1009 return; 1353 return;
1010} 1354}
1011 1355
1012#ifdef __i386__
1013
1014static int __init disable_acpi_irq(const struct dmi_system_id *d) 1356static int __init disable_acpi_irq(const struct dmi_system_id *d)
1015{ 1357{
1016 if (!acpi_force) { 1358 if (!acpi_force) {
@@ -1061,6 +1403,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1061} 1403}
1062 1404
1063/* 1405/*
1406 * Force ignoring BIOS IRQ0 pin2 override
1407 */
1408static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1409{
1410 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
1411 acpi_skip_timer_override = 1;
1412 return 0;
1413}
1414
1415/*
1064 * If your system is blacklisted here, but you find that acpi=force 1416 * If your system is blacklisted here, but you find that acpi=force
1065 * works for you, please contact acpi-devel@sourceforge.net 1417 * works for you, please contact acpi-devel@sourceforge.net
1066 */ 1418 */
@@ -1227,11 +1579,35 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1227 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), 1579 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1228 }, 1580 },
1229 }, 1581 },
1582 /*
1583 * HP laptops which use a DSDT reporting as HP/SB400/10000,
1584 * which includes some code which overrides all temperature
1585 * trip points to 16C if the INTIN2 input of the I/O APIC
1586 * is enabled. This input is incorrectly designated the
1587 * ISA IRQ 0 via an interrupt source override even though
1588 * it is wired to the output of the master 8259A and INTIN0
1589 * is not connected at all. Force ignoring BIOS IRQ0 pin2
1590 * override in that cases.
1591 */
1592 {
1593 .callback = dmi_ignore_irq0_timer_override,
1594 .ident = "HP NX6125 laptop",
1595 .matches = {
1596 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1597 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
1598 },
1599 },
1600 {
1601 .callback = dmi_ignore_irq0_timer_override,
1602 .ident = "HP NX6325 laptop",
1603 .matches = {
1604 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1605 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1606 },
1607 },
1230 {} 1608 {}
1231}; 1609};
1232 1610
1233#endif /* __i386__ */
1234
1235/* 1611/*
1236 * acpi_boot_table_init() and acpi_boot_init() 1612 * acpi_boot_table_init() and acpi_boot_init()
1237 * called from setup_arch(), always. 1613 * called from setup_arch(), always.
@@ -1259,9 +1635,7 @@ int __init acpi_boot_table_init(void)
1259{ 1635{
1260 int error; 1636 int error;
1261 1637
1262#ifdef __i386__
1263 dmi_check_system(acpi_dmi_table); 1638 dmi_check_system(acpi_dmi_table);
1264#endif
1265 1639
1266 /* 1640 /*
1267 * If acpi_disabled, bail out 1641 * If acpi_disabled, bail out
@@ -1386,6 +1760,20 @@ static int __init parse_pci(char *arg)
1386} 1760}
1387early_param("pci", parse_pci); 1761early_param("pci", parse_pci);
1388 1762
1763int __init acpi_mps_check(void)
1764{
1765#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
1766/* mptable code is not built-in*/
1767 if (acpi_disabled || acpi_noirq) {
1768 printk(KERN_WARNING "MPS support code is not built-in.\n"
1769 "Using acpi=off or acpi=noirq or pci=noacpi "
1770 "may have problem\n");
1771 return 1;
1772 }
1773#endif
1774 return 0;
1775}
1776
1389#ifdef CONFIG_X86_IO_APIC 1777#ifdef CONFIG_X86_IO_APIC
1390static int __init parse_acpi_skip_timer_override(char *arg) 1778static int __init parse_acpi_skip_timer_override(char *arg)
1391{ 1779{
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index c2502eb9aa83..9220cf46aa10 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,6 +73,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
73 struct cpuinfo_x86 *c = &cpu_data(cpu); 73 struct cpuinfo_x86 *c = &cpu_data(cpu);
74 74
75 cpumask_t saved_mask; 75 cpumask_t saved_mask;
76 cpumask_of_cpu_ptr(new_mask, cpu);
76 int retval; 77 int retval;
77 unsigned int eax, ebx, ecx, edx; 78 unsigned int eax, ebx, ecx, edx;
78 unsigned int edx_part; 79 unsigned int edx_part;
@@ -91,7 +92,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
91 92
92 /* Make sure we are running on right CPU */ 93 /* Make sure we are running on right CPU */
93 saved_mask = current->cpus_allowed; 94 saved_mask = current->cpus_allowed;
94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 95 retval = set_cpus_allowed_ptr(current, new_mask);
95 if (retval) 96 if (retval)
96 return -1; 97 return -1;
97 98
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index de2d2e4ebad9..7c074eec39fb 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
56 if (cpu_has(c, X86_FEATURE_ACPI)) 56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH; 57 buf[2] |= ACPI_PDC_T_FFH;
58 58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
59 obj->type = ACPI_TYPE_BUFFER; 65 obj->type = ACPI_TYPE_BUFFER;
60 obj->buffer.length = 12; 66 obj->buffer.length = 12;
61 obj->buffer.pointer = (u8 *) buf; 67 obj->buffer.pointer = (u8 *) buf;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 36af01f029ed..fa2161d5003b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -51,18 +52,27 @@ int acpi_save_state_mem(void)
51 header->video_mode = saved_video_mode; 52 header->video_mode = saved_video_mode;
52 53
53 header->wakeup_jmp_seg = acpi_wakeup_address >> 4; 54 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
55
56 /*
57 * Set up the wakeup GDT. We set these up as Big Real Mode,
58 * that is, with limits set to 4 GB. At least the Lenovo
59 * Thinkpad X61 is known to need this for the video BIOS
60 * initialization quirk to work; this is likely to also
61 * be the case for other laptops or integrated video devices.
62 */
63
54 /* GDT[0]: GDT self-pointer */ 64 /* GDT[0]: GDT self-pointer */
55 header->wakeup_gdt[0] = 65 header->wakeup_gdt[0] =
56 (u64)(sizeof(header->wakeup_gdt) - 1) + 66 (u64)(sizeof(header->wakeup_gdt) - 1) +
57 ((u64)(acpi_wakeup_address + 67 ((u64)(acpi_wakeup_address +
58 ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) 68 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
59 << 16); 69 << 16);
60 /* GDT[1]: real-mode-like code segment */ 70 /* GDT[1]: big real mode-like code segment */
61 header->wakeup_gdt[1] = (0x009bULL << 40) + 71 header->wakeup_gdt[1] =
62 ((u64)acpi_wakeup_address << 16) + 0xffff; 72 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
63 /* GDT[2]: real-mode-like data segment */ 73 /* GDT[2]: big real mode-like data segment */
64 header->wakeup_gdt[2] = (0x0093ULL << 40) + 74 header->wakeup_gdt[2] =
65 ((u64)acpi_wakeup_address << 16) + 0xffff; 75 GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
66 76
67#ifndef CONFIG_64BIT 77#ifndef CONFIG_64BIT
68 store_gdt((struct desc_ptr *)&header->pmode_gdt); 78 store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -86,7 +96,9 @@ int acpi_save_state_mem(void)
86 saved_magic = 0x12345678; 96 saved_magic = 0x12345678;
87#else /* CONFIG_64BIT */ 97#else /* CONFIG_64BIT */
88 header->trampoline_segment = setup_trampoline() >> 4; 98 header->trampoline_segment = setup_trampoline() >> 4;
89 init_rsp = (unsigned long)temp_stack + 4096; 99#ifdef CONFIG_SMP
100 stack_start.sp = temp_stack + 4096;
101#endif
90 initial_code = (unsigned long)wakeup_long64; 102 initial_code = (unsigned long)wakeup_long64;
91 saved_magic = 0x123456789abcdef0; 103 saved_magic = 0x123456789abcdef0;
92#endif /* CONFIG_64BIT */ 104#endif /* CONFIG_64BIT */
@@ -138,6 +150,12 @@ static int __init acpi_sleep_setup(char *str)
138 acpi_realmode_flags |= 2; 150 acpi_realmode_flags |= 2;
139 if (strncmp(str, "s3_beep", 7) == 0) 151 if (strncmp(str, "s3_beep", 7) == 0)
140 acpi_realmode_flags |= 4; 152 acpi_realmode_flags |= 4;
153#ifdef CONFIG_HIBERNATION
154 if (strncmp(str, "s4_nohwsig", 10) == 0)
155 acpi_no_s4_hw_signature();
156#endif
157 if (strncmp(str, "old_ordering", 12) == 0)
158 acpi_old_suspend_ordering();
141 str = strchr(str, ','); 159 str = strchr(str, ',');
142 if (str != NULL) 160 if (str != NULL)
143 str += strspn(str, ", \t"); 161 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..2763cb37b553 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/spinlock.h> 3#include <linux/mutex.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
143#ifdef CONFIG_X86_64 143#ifdef CONFIG_X86_64
144 144
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146static inline const unsigned char*const * find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
162 { -1, NULL } 162 { -1, NULL }
163}; 163};
164 164
165static const unsigned char*const * find_nop_table(void) 165const unsigned char *const *find_nop_table(void)
166{ 166{
167 const unsigned char *const *noptable = intel_nops; 167 const unsigned char *const *noptable = intel_nops;
168 int i; 168 int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
279 struct list_head next; 279 struct list_head next;
280}; 280};
281static LIST_HEAD(smp_alt_modules); 281static LIST_HEAD(smp_alt_modules);
282static DEFINE_SPINLOCK(smp_alt); 282static DEFINE_MUTEX(smp_alt);
283static int smp_mode = 1; /* protected by smp_alt */ 283static int smp_mode = 1; /* protected by smp_alt */
284 284
285void alternatives_smp_module_add(struct module *mod, char *name, 285void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
312 __func__, smp->locks, smp->locks_end, 312 __func__, smp->locks, smp->locks_end,
313 smp->text, smp->text_end, smp->name); 313 smp->text, smp->text_end, smp->name);
314 314
315 spin_lock(&smp_alt); 315 mutex_lock(&smp_alt);
316 list_add_tail(&smp->next, &smp_alt_modules); 316 list_add_tail(&smp->next, &smp_alt_modules);
317 if (boot_cpu_has(X86_FEATURE_UP)) 317 if (boot_cpu_has(X86_FEATURE_UP))
318 alternatives_smp_unlock(smp->locks, smp->locks_end, 318 alternatives_smp_unlock(smp->locks, smp->locks_end,
319 smp->text, smp->text_end); 319 smp->text, smp->text_end);
320 spin_unlock(&smp_alt); 320 mutex_unlock(&smp_alt);
321} 321}
322 322
323void alternatives_smp_module_del(struct module *mod) 323void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
327 if (smp_alt_once || noreplace_smp) 327 if (smp_alt_once || noreplace_smp)
328 return; 328 return;
329 329
330 spin_lock(&smp_alt); 330 mutex_lock(&smp_alt);
331 list_for_each_entry(item, &smp_alt_modules, next) { 331 list_for_each_entry(item, &smp_alt_modules, next) {
332 if (mod != item->mod) 332 if (mod != item->mod)
333 continue; 333 continue;
334 list_del(&item->next); 334 list_del(&item->next);
335 spin_unlock(&smp_alt); 335 mutex_unlock(&smp_alt);
336 DPRINTK("%s: %s\n", __func__, item->name); 336 DPRINTK("%s: %s\n", __func__, item->name);
337 kfree(item); 337 kfree(item);
338 return; 338 return;
339 } 339 }
340 spin_unlock(&smp_alt); 340 mutex_unlock(&smp_alt);
341} 341}
342 342
343void alternatives_smp_switch(int smp) 343void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
359 return; 359 return;
360 BUG_ON(!smp && (num_online_cpus() > 1)); 360 BUG_ON(!smp && (num_online_cpus() > 1));
361 361
362 spin_lock(&smp_alt); 362 mutex_lock(&smp_alt);
363 363
364 /* 364 /*
365 * Avoid unnecessary switches because it forces JIT based VMs to 365 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
383 mod->text, mod->text_end); 383 mod->text, mod->text_end);
384 } 384 }
385 smp_mode = smp; 385 smp_mode = smp;
386 spin_unlock(&smp_alt); 386 mutex_unlock(&smp_alt);
387} 387}
388 388
389#endif 389#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 000000000000..c25210e6ac88
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,1167 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/gfp.h>
22#include <linux/bitops.h>
23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h>
25#include <asm/proto.h>
26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h>
29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31
32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34
35#define EXIT_LOOP_COUNT 10000000
36
37static DEFINE_RWLOCK(amd_iommu_devtable_lock);
38
39/*
40 * general struct to manage commands send to an IOMMU
41 */
42struct iommu_cmd {
43 u32 data[4];
44};
45
46static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
47 struct unity_map_entry *e);
48
49/* returns !0 if the IOMMU is caching non-present entries in its TLB */
50static int iommu_has_npcache(struct amd_iommu *iommu)
51{
52 return iommu->cap & IOMMU_CAP_NPCACHE;
53}
54
55/****************************************************************************
56 *
57 * IOMMU command queuing functions
58 *
59 ****************************************************************************/
60
61/*
62 * Writes the command to the IOMMUs command buffer and informs the
63 * hardware about the new command. Must be called with iommu->lock held.
64 */
65static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
66{
67 u32 tail, head;
68 u8 *target;
69
70 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
71 target = (iommu->cmd_buf + tail);
72 memcpy_toio(target, cmd, sizeof(*cmd));
73 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
74 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
75 if (tail == head)
76 return -ENOMEM;
77 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
78
79 return 0;
80}
81
82/*
83 * General queuing function for commands. Takes iommu->lock and calls
84 * __iommu_queue_command().
85 */
86static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
87{
88 unsigned long flags;
89 int ret;
90
91 spin_lock_irqsave(&iommu->lock, flags);
92 ret = __iommu_queue_command(iommu, cmd);
93 spin_unlock_irqrestore(&iommu->lock, flags);
94
95 return ret;
96}
97
98/*
99 * This function is called whenever we need to ensure that the IOMMU has
100 * completed execution of all commands we sent. It sends a
101 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
102 * us about that by writing a value to a physical address we pass with
103 * the command.
104 */
105static int iommu_completion_wait(struct amd_iommu *iommu)
106{
107 int ret;
108 struct iommu_cmd cmd;
109 volatile u64 ready = 0;
110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
112
113 memset(&cmd, 0, sizeof(cmd));
114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
115 cmd.data[1] = upper_32_bits(ready_phys);
116 cmd.data[2] = 1; /* value written to 'ready' */
117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
118
119 iommu->need_sync = 0;
120
121 ret = iommu_queue_command(iommu, &cmd);
122
123 if (ret)
124 return ret;
125
126 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i;
128 cpu_relax();
129 }
130
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
133
134 return 0;
135}
136
137/*
138 * Command send function for invalidating a device table entry
139 */
140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
141{
142 struct iommu_cmd cmd;
143
144 BUG_ON(iommu == NULL);
145
146 memset(&cmd, 0, sizeof(cmd));
147 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
148 cmd.data[0] = devid;
149
150 iommu->need_sync = 1;
151
152 return iommu_queue_command(iommu, &cmd);
153}
154
155/*
156 * Generic command send function for invalidaing TLB entries
157 */
158static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
159 u64 address, u16 domid, int pde, int s)
160{
161 struct iommu_cmd cmd;
162
163 memset(&cmd, 0, sizeof(cmd));
164 address &= PAGE_MASK;
165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
166 cmd.data[1] |= domid;
167 cmd.data[2] = LOW_U32(address);
168 cmd.data[3] = upper_32_bits(address);
169 if (s) /* size bit - we flush more than one 4kb page */
170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
173
174 iommu->need_sync = 1;
175
176 return iommu_queue_command(iommu, &cmd);
177}
178
179/*
180 * TLB invalidation function which is called from the mapping functions.
181 * It invalidates a single PTE if the range to flush is within a single
182 * page. Otherwise it flushes the whole TLB of the IOMMU.
183 */
184static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
185 u64 address, size_t size)
186{
187 int s = 0;
188 unsigned pages = to_pages(address, size);
189
190 address &= PAGE_MASK;
191
192 if (pages > 1) {
193 /*
194 * If we have to flush more than one page, flush all
195 * TLB entries for this domain
196 */
197 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
198 s = 1;
199 }
200
201 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
202
203 return 0;
204}
205
206/****************************************************************************
207 *
208 * The functions below are used the create the page table mappings for
209 * unity mapped regions.
210 *
211 ****************************************************************************/
212
213/*
214 * Generic mapping functions. It maps a physical address into a DMA
215 * address space. It allocates the page table pages if necessary.
216 * In the future it can be extended to a generic mapping function
217 * supporting all features of AMD IOMMU page tables like level skipping
218 * and full 64 bit address spaces.
219 */
220static int iommu_map(struct protection_domain *dom,
221 unsigned long bus_addr,
222 unsigned long phys_addr,
223 int prot)
224{
225 u64 __pte, *pte, *page;
226
227 bus_addr = PAGE_ALIGN(bus_addr);
228 phys_addr = PAGE_ALIGN(bus_addr);
229
230 /* only support 512GB address spaces for now */
231 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
232 return -EINVAL;
233
234 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
235
236 if (!IOMMU_PTE_PRESENT(*pte)) {
237 page = (u64 *)get_zeroed_page(GFP_KERNEL);
238 if (!page)
239 return -ENOMEM;
240 *pte = IOMMU_L2_PDE(virt_to_phys(page));
241 }
242
243 pte = IOMMU_PTE_PAGE(*pte);
244 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
245
246 if (!IOMMU_PTE_PRESENT(*pte)) {
247 page = (u64 *)get_zeroed_page(GFP_KERNEL);
248 if (!page)
249 return -ENOMEM;
250 *pte = IOMMU_L1_PDE(virt_to_phys(page));
251 }
252
253 pte = IOMMU_PTE_PAGE(*pte);
254 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
255
256 if (IOMMU_PTE_PRESENT(*pte))
257 return -EBUSY;
258
259 __pte = phys_addr | IOMMU_PTE_P;
260 if (prot & IOMMU_PROT_IR)
261 __pte |= IOMMU_PTE_IR;
262 if (prot & IOMMU_PROT_IW)
263 __pte |= IOMMU_PTE_IW;
264
265 *pte = __pte;
266
267 return 0;
268}
269
270/*
271 * This function checks if a specific unity mapping entry is needed for
272 * this specific IOMMU.
273 */
274static int iommu_for_unity_map(struct amd_iommu *iommu,
275 struct unity_map_entry *entry)
276{
277 u16 bdf, i;
278
279 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
280 bdf = amd_iommu_alias_table[i];
281 if (amd_iommu_rlookup_table[bdf] == iommu)
282 return 1;
283 }
284
285 return 0;
286}
287
288/*
289 * Init the unity mappings for a specific IOMMU in the system
290 *
291 * Basically iterates over all unity mapping entries and applies them to
292 * the default domain DMA of that IOMMU if necessary.
293 */
294static int iommu_init_unity_mappings(struct amd_iommu *iommu)
295{
296 struct unity_map_entry *entry;
297 int ret;
298
299 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
300 if (!iommu_for_unity_map(iommu, entry))
301 continue;
302 ret = dma_ops_unity_map(iommu->default_dom, entry);
303 if (ret)
304 return ret;
305 }
306
307 return 0;
308}
309
310/*
311 * This function actually applies the mapping to the page table of the
312 * dma_ops domain.
313 */
314static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
315 struct unity_map_entry *e)
316{
317 u64 addr;
318 int ret;
319
320 for (addr = e->address_start; addr < e->address_end;
321 addr += PAGE_SIZE) {
322 ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
323 if (ret)
324 return ret;
325 /*
326 * if unity mapping is in aperture range mark the page
327 * as allocated in the aperture
328 */
329 if (addr < dma_dom->aperture_size)
330 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
331 }
332
333 return 0;
334}
335
336/*
337 * Inits the unity mappings required for a specific device
338 */
339static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
340 u16 devid)
341{
342 struct unity_map_entry *e;
343 int ret;
344
345 list_for_each_entry(e, &amd_iommu_unity_map, list) {
346 if (!(devid >= e->devid_start && devid <= e->devid_end))
347 continue;
348 ret = dma_ops_unity_map(dma_dom, e);
349 if (ret)
350 return ret;
351 }
352
353 return 0;
354}
355
356/****************************************************************************
357 *
358 * The next functions belong to the address allocator for the dma_ops
359 * interface functions. They work like the allocators in the other IOMMU
360 * drivers. Its basically a bitmap which marks the allocated pages in
361 * the aperture. Maybe it could be enhanced in the future to a more
362 * efficient allocator.
363 *
364 ****************************************************************************/
365static unsigned long dma_mask_to_pages(unsigned long mask)
366{
367 return (mask >> PAGE_SHIFT) +
368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
369}
370
371/*
372 * The address allocator core function.
373 *
374 * called with domain->lock held
375 */
376static unsigned long dma_ops_alloc_addresses(struct device *dev,
377 struct dma_ops_domain *dom,
378 unsigned int pages)
379{
380 unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
381 unsigned long address;
382 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
383 unsigned long boundary_size;
384
385 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
386 PAGE_SIZE) >> PAGE_SHIFT;
387 limit = limit < size ? limit : size;
388
389 if (dom->next_bit >= limit)
390 dom->next_bit = 0;
391
392 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
393 0 , boundary_size, 0);
394 if (address == -1)
395 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
396 0, boundary_size, 0);
397
398 if (likely(address != -1)) {
399 dom->next_bit = address + pages;
400 address <<= PAGE_SHIFT;
401 } else
402 address = bad_dma_address;
403
404 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
405
406 return address;
407}
408
409/*
410 * The address free function.
411 *
412 * called with domain->lock held
413 */
414static void dma_ops_free_addresses(struct dma_ops_domain *dom,
415 unsigned long address,
416 unsigned int pages)
417{
418 address >>= PAGE_SHIFT;
419 iommu_area_free(dom->bitmap, address, pages);
420}
421
422/****************************************************************************
423 *
424 * The next functions belong to the domain allocation. A domain is
425 * allocated for every IOMMU as the default domain. If device isolation
426 * is enabled, every device get its own domain. The most important thing
427 * about domains is the page table mapping the DMA address space they
428 * contain.
429 *
430 ****************************************************************************/
431
432static u16 domain_id_alloc(void)
433{
434 unsigned long flags;
435 int id;
436
437 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
438 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
439 BUG_ON(id == 0);
440 if (id > 0 && id < MAX_DOMAIN_ID)
441 __set_bit(id, amd_iommu_pd_alloc_bitmap);
442 else
443 id = 0;
444 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
445
446 return id;
447}
448
449/*
450 * Used to reserve address ranges in the aperture (e.g. for exclusion
451 * ranges.
452 */
453static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
454 unsigned long start_page,
455 unsigned int pages)
456{
457 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
458
459 if (start_page + pages > last_page)
460 pages = last_page - start_page;
461
462 set_bit_string(dom->bitmap, start_page, pages);
463}
464
465static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
466{
467 int i, j;
468 u64 *p1, *p2, *p3;
469
470 p1 = dma_dom->domain.pt_root;
471
472 if (!p1)
473 return;
474
475 for (i = 0; i < 512; ++i) {
476 if (!IOMMU_PTE_PRESENT(p1[i]))
477 continue;
478
479 p2 = IOMMU_PTE_PAGE(p1[i]);
480 for (j = 0; j < 512; ++i) {
481 if (!IOMMU_PTE_PRESENT(p2[j]))
482 continue;
483 p3 = IOMMU_PTE_PAGE(p2[j]);
484 free_page((unsigned long)p3);
485 }
486
487 free_page((unsigned long)p2);
488 }
489
490 free_page((unsigned long)p1);
491}
492
493/*
494 * Free a domain, only used if something went wrong in the
495 * allocation path and we need to free an already allocated page table
496 */
497static void dma_ops_domain_free(struct dma_ops_domain *dom)
498{
499 if (!dom)
500 return;
501
502 dma_ops_free_pagetable(dom);
503
504 kfree(dom->pte_pages);
505
506 kfree(dom->bitmap);
507
508 kfree(dom);
509}
510
511/*
512 * Allocates a new protection domain usable for the dma_ops functions.
513 * It also intializes the page table and the address allocator data
514 * structures required for the dma_ops interface
515 */
516static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
517 unsigned order)
518{
519 struct dma_ops_domain *dma_dom;
520 unsigned i, num_pte_pages;
521 u64 *l2_pde;
522 u64 address;
523
524 /*
525 * Currently the DMA aperture must be between 32 MB and 1GB in size
526 */
527 if ((order < 25) || (order > 30))
528 return NULL;
529
530 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
531 if (!dma_dom)
532 return NULL;
533
534 spin_lock_init(&dma_dom->domain.lock);
535
536 dma_dom->domain.id = domain_id_alloc();
537 if (dma_dom->domain.id == 0)
538 goto free_dma_dom;
539 dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
540 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
541 dma_dom->domain.priv = dma_dom;
542 if (!dma_dom->domain.pt_root)
543 goto free_dma_dom;
544 dma_dom->aperture_size = (1ULL << order);
545 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
546 GFP_KERNEL);
547 if (!dma_dom->bitmap)
548 goto free_dma_dom;
549 /*
550 * mark the first page as allocated so we never return 0 as
551 * a valid dma-address. So we can use 0 as error value
552 */
553 dma_dom->bitmap[0] = 1;
554 dma_dom->next_bit = 0;
555
556 /* Intialize the exclusion range if necessary */
557 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) {
559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
560 int pages = to_pages(iommu->exclusion_start,
561 iommu->exclusion_length);
562 dma_ops_reserve_addresses(dma_dom, startpage, pages);
563 }
564
565 /*
566 * At the last step, build the page tables so we don't need to
567 * allocate page table pages in the dma_ops mapping/unmapping
568 * path.
569 */
570 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
571 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
572 GFP_KERNEL);
573 if (!dma_dom->pte_pages)
574 goto free_dma_dom;
575
576 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
577 if (l2_pde == NULL)
578 goto free_dma_dom;
579
580 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
581
582 for (i = 0; i < num_pte_pages; ++i) {
583 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
584 if (!dma_dom->pte_pages[i])
585 goto free_dma_dom;
586 address = virt_to_phys(dma_dom->pte_pages[i]);
587 l2_pde[i] = IOMMU_L1_PDE(address);
588 }
589
590 return dma_dom;
591
592free_dma_dom:
593 dma_ops_domain_free(dma_dom);
594
595 return NULL;
596}
597
598/*
599 * Find out the protection domain structure for a given PCI device. This
600 * will give us the pointer to the page table root for example.
601 */
602static struct protection_domain *domain_for_device(u16 devid)
603{
604 struct protection_domain *dom;
605 unsigned long flags;
606
607 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
608 dom = amd_iommu_pd_table[devid];
609 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
610
611 return dom;
612}
613
614/*
615 * If a device is not yet associated with a domain, this function does
616 * assigns it visible for the hardware
617 */
618static void set_device_domain(struct amd_iommu *iommu,
619 struct protection_domain *domain,
620 u16 devid)
621{
622 unsigned long flags;
623
624 u64 pte_root = virt_to_phys(domain->pt_root);
625
626 pte_root |= (domain->mode & 0x07) << 9;
627 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
628
629 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
630 amd_iommu_dev_table[devid].data[0] = pte_root;
631 amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
632 amd_iommu_dev_table[devid].data[2] = domain->id;
633
634 amd_iommu_pd_table[devid] = domain;
635 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
636
637 iommu_queue_inv_dev_entry(iommu, devid);
638
639 iommu->need_sync = 1;
640}
641
642/*****************************************************************************
643 *
644 * The next functions belong to the dma_ops mapping/unmapping code.
645 *
646 *****************************************************************************/
647
648/*
649 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device.
652 * If the device is not yet associated with a domain this is also done
653 * in this function.
654 */
655static int get_device_resources(struct device *dev,
656 struct amd_iommu **iommu,
657 struct protection_domain **domain,
658 u16 *bdf)
659{
660 struct dma_ops_domain *dma_dom;
661 struct pci_dev *pcidev;
662 u16 _bdf;
663
664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
665
666 pcidev = to_pci_dev(dev);
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668
669 /* device not translated by any IOMMU in the system? */
670 if (_bdf >= amd_iommu_last_bdf) {
671 *iommu = NULL;
672 *domain = NULL;
673 *bdf = 0xffff;
674 return 0;
675 }
676
677 *bdf = amd_iommu_alias_table[_bdf];
678
679 *iommu = amd_iommu_rlookup_table[*bdf];
680 if (*iommu == NULL)
681 return 0;
682 dma_dom = (*iommu)->default_dom;
683 *domain = domain_for_device(*bdf);
684 if (*domain == NULL) {
685 *domain = &dma_dom->domain;
686 set_device_domain(*iommu, *domain, *bdf);
687 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
688 "device ", (*domain)->id);
689 print_devid(_bdf, 1);
690 }
691
692 return 1;
693}
694
695/*
696 * This is the generic map function. It maps one 4kb page at paddr to
697 * the given address in the DMA address space for the domain.
698 */
699static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
700 struct dma_ops_domain *dom,
701 unsigned long address,
702 phys_addr_t paddr,
703 int direction)
704{
705 u64 *pte, __pte;
706
707 WARN_ON(address > dom->aperture_size);
708
709 paddr &= PAGE_MASK;
710
711 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
712 pte += IOMMU_PTE_L0_INDEX(address);
713
714 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
715
716 if (direction == DMA_TO_DEVICE)
717 __pte |= IOMMU_PTE_IR;
718 else if (direction == DMA_FROM_DEVICE)
719 __pte |= IOMMU_PTE_IW;
720 else if (direction == DMA_BIDIRECTIONAL)
721 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
722
723 WARN_ON(*pte);
724
725 *pte = __pte;
726
727 return (dma_addr_t)address;
728}
729
730/*
731 * The generic unmapping function for on page in the DMA address space.
732 */
733static void dma_ops_domain_unmap(struct amd_iommu *iommu,
734 struct dma_ops_domain *dom,
735 unsigned long address)
736{
737 u64 *pte;
738
739 if (address >= dom->aperture_size)
740 return;
741
742 WARN_ON(address & 0xfffULL || address > dom->aperture_size);
743
744 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
745 pte += IOMMU_PTE_L0_INDEX(address);
746
747 WARN_ON(!*pte);
748
749 *pte = 0ULL;
750}
751
752/*
753 * This function contains common code for mapping of a physically
754 * contiguous memory region into DMA address space. It is uses by all
755 * mapping functions provided by this IOMMU driver.
756 * Must be called with the domain lock held.
757 */
758static dma_addr_t __map_single(struct device *dev,
759 struct amd_iommu *iommu,
760 struct dma_ops_domain *dma_dom,
761 phys_addr_t paddr,
762 size_t size,
763 int dir)
764{
765 dma_addr_t offset = paddr & ~PAGE_MASK;
766 dma_addr_t address, start;
767 unsigned int pages;
768 int i;
769
770 pages = to_pages(paddr, size);
771 paddr &= PAGE_MASK;
772
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages);
774 if (unlikely(address == bad_dma_address))
775 goto out;
776
777 start = address;
778 for (i = 0; i < pages; ++i) {
779 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
780 paddr += PAGE_SIZE;
781 start += PAGE_SIZE;
782 }
783 address += offset;
784
785out:
786 return address;
787}
788
789/*
790 * Does the reverse of the __map_single function. Must be called with
791 * the domain lock held too
792 */
793static void __unmap_single(struct amd_iommu *iommu,
794 struct dma_ops_domain *dma_dom,
795 dma_addr_t dma_addr,
796 size_t size,
797 int dir)
798{
799 dma_addr_t i, start;
800 unsigned int pages;
801
802 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
803 return;
804
805 pages = to_pages(dma_addr, size);
806 dma_addr &= PAGE_MASK;
807 start = dma_addr;
808
809 for (i = 0; i < pages; ++i) {
810 dma_ops_domain_unmap(iommu, dma_dom, start);
811 start += PAGE_SIZE;
812 }
813
814 dma_ops_free_addresses(dma_dom, dma_addr, pages);
815}
816
817/*
818 * The exported map_single function for dma_ops.
819 */
820static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
821 size_t size, int dir)
822{
823 unsigned long flags;
824 struct amd_iommu *iommu;
825 struct protection_domain *domain;
826 u16 devid;
827 dma_addr_t addr;
828
829 get_device_resources(dev, &iommu, &domain, &devid);
830
831 if (iommu == NULL || domain == NULL)
832 /* device not handled by any AMD IOMMU */
833 return (dma_addr_t)paddr;
834
835 spin_lock_irqsave(&domain->lock, flags);
836 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
837 if (addr == bad_dma_address)
838 goto out;
839
840 if (iommu_has_npcache(iommu))
841 iommu_flush_pages(iommu, domain->id, addr, size);
842
843 if (iommu->need_sync)
844 iommu_completion_wait(iommu);
845
846out:
847 spin_unlock_irqrestore(&domain->lock, flags);
848
849 return addr;
850}
851
852/*
853 * The exported unmap_single function for dma_ops.
854 */
855static void unmap_single(struct device *dev, dma_addr_t dma_addr,
856 size_t size, int dir)
857{
858 unsigned long flags;
859 struct amd_iommu *iommu;
860 struct protection_domain *domain;
861 u16 devid;
862
863 if (!get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */
865 return;
866
867 spin_lock_irqsave(&domain->lock, flags);
868
869 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
870
871 iommu_flush_pages(iommu, domain->id, dma_addr, size);
872
873 if (iommu->need_sync)
874 iommu_completion_wait(iommu);
875
876 spin_unlock_irqrestore(&domain->lock, flags);
877}
878
879/*
880 * This is a special map_sg function which is used if we should map a
881 * device which is not handled by an AMD IOMMU in the system.
882 */
883static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
884 int nelems, int dir)
885{
886 struct scatterlist *s;
887 int i;
888
889 for_each_sg(sglist, s, nelems, i) {
890 s->dma_address = (dma_addr_t)sg_phys(s);
891 s->dma_length = s->length;
892 }
893
894 return nelems;
895}
896
897/*
898 * The exported map_sg function for dma_ops (handles scatter-gather
899 * lists).
900 */
901static int map_sg(struct device *dev, struct scatterlist *sglist,
902 int nelems, int dir)
903{
904 unsigned long flags;
905 struct amd_iommu *iommu;
906 struct protection_domain *domain;
907 u16 devid;
908 int i;
909 struct scatterlist *s;
910 phys_addr_t paddr;
911 int mapped_elems = 0;
912
913 get_device_resources(dev, &iommu, &domain, &devid);
914
915 if (!iommu || !domain)
916 return map_sg_no_iommu(dev, sglist, nelems, dir);
917
918 spin_lock_irqsave(&domain->lock, flags);
919
920 for_each_sg(sglist, s, nelems, i) {
921 paddr = sg_phys(s);
922
923 s->dma_address = __map_single(dev, iommu, domain->priv,
924 paddr, s->length, dir);
925
926 if (s->dma_address) {
927 s->dma_length = s->length;
928 mapped_elems++;
929 } else
930 goto unmap;
931 if (iommu_has_npcache(iommu))
932 iommu_flush_pages(iommu, domain->id, s->dma_address,
933 s->dma_length);
934 }
935
936 if (iommu->need_sync)
937 iommu_completion_wait(iommu);
938
939out:
940 spin_unlock_irqrestore(&domain->lock, flags);
941
942 return mapped_elems;
943unmap:
944 for_each_sg(sglist, s, mapped_elems, i) {
945 if (s->dma_address)
946 __unmap_single(iommu, domain->priv, s->dma_address,
947 s->dma_length, dir);
948 s->dma_address = s->dma_length = 0;
949 }
950
951 mapped_elems = 0;
952
953 goto out;
954}
955
956/*
957 * The exported map_sg function for dma_ops (handles scatter-gather
958 * lists).
959 */
960static void unmap_sg(struct device *dev, struct scatterlist *sglist,
961 int nelems, int dir)
962{
963 unsigned long flags;
964 struct amd_iommu *iommu;
965 struct protection_domain *domain;
966 struct scatterlist *s;
967 u16 devid;
968 int i;
969
970 if (!get_device_resources(dev, &iommu, &domain, &devid))
971 return;
972
973 spin_lock_irqsave(&domain->lock, flags);
974
975 for_each_sg(sglist, s, nelems, i) {
976 __unmap_single(iommu, domain->priv, s->dma_address,
977 s->dma_length, dir);
978 iommu_flush_pages(iommu, domain->id, s->dma_address,
979 s->dma_length);
980 s->dma_address = s->dma_length = 0;
981 }
982
983 if (iommu->need_sync)
984 iommu_completion_wait(iommu);
985
986 spin_unlock_irqrestore(&domain->lock, flags);
987}
988
989/*
990 * The exported alloc_coherent function for dma_ops.
991 */
992static void *alloc_coherent(struct device *dev, size_t size,
993 dma_addr_t *dma_addr, gfp_t flag)
994{
995 unsigned long flags;
996 void *virt_addr;
997 struct amd_iommu *iommu;
998 struct protection_domain *domain;
999 u16 devid;
1000 phys_addr_t paddr;
1001
1002 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1003 if (!virt_addr)
1004 return 0;
1005
1006 memset(virt_addr, 0, size);
1007 paddr = virt_to_phys(virt_addr);
1008
1009 get_device_resources(dev, &iommu, &domain, &devid);
1010
1011 if (!iommu || !domain) {
1012 *dma_addr = (dma_addr_t)paddr;
1013 return virt_addr;
1014 }
1015
1016 spin_lock_irqsave(&domain->lock, flags);
1017
1018 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1019 size, DMA_BIDIRECTIONAL);
1020
1021 if (*dma_addr == bad_dma_address) {
1022 free_pages((unsigned long)virt_addr, get_order(size));
1023 virt_addr = NULL;
1024 goto out;
1025 }
1026
1027 if (iommu_has_npcache(iommu))
1028 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1029
1030 if (iommu->need_sync)
1031 iommu_completion_wait(iommu);
1032
1033out:
1034 spin_unlock_irqrestore(&domain->lock, flags);
1035
1036 return virt_addr;
1037}
1038
1039/*
1040 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */
1044static void free_coherent(struct device *dev, size_t size,
1045 void *virt_addr, dma_addr_t dma_addr)
1046{
1047 unsigned long flags;
1048 struct amd_iommu *iommu;
1049 struct protection_domain *domain;
1050 u16 devid;
1051
1052 get_device_resources(dev, &iommu, &domain, &devid);
1053
1054 if (!iommu || !domain)
1055 goto free_mem;
1056
1057 spin_lock_irqsave(&domain->lock, flags);
1058
1059 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1060 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1061
1062 if (iommu->need_sync)
1063 iommu_completion_wait(iommu);
1064
1065 spin_unlock_irqrestore(&domain->lock, flags);
1066
1067free_mem:
1068 free_pages((unsigned long)virt_addr, get_order(size));
1069}
1070
1071/*
1072 * The function for pre-allocating protection domains.
1073 *
1074 * If the driver core informs the DMA layer if a driver grabs a device
1075 * we don't need to preallocate the protection domains anymore.
1076 * For now we have to.
1077 */
1078void prealloc_protection_domains(void)
1079{
1080 struct pci_dev *dev = NULL;
1081 struct dma_ops_domain *dma_dom;
1082 struct amd_iommu *iommu;
1083 int order = amd_iommu_aperture_order;
1084 u16 devid;
1085
1086 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1087 devid = (dev->bus->number << 8) | dev->devfn;
1088 if (devid >= amd_iommu_last_bdf)
1089 continue;
1090 devid = amd_iommu_alias_table[devid];
1091 if (domain_for_device(devid))
1092 continue;
1093 iommu = amd_iommu_rlookup_table[devid];
1094 if (!iommu)
1095 continue;
1096 dma_dom = dma_ops_domain_alloc(iommu, order);
1097 if (!dma_dom)
1098 continue;
1099 init_unity_mappings_for_device(dma_dom, devid);
1100 set_device_domain(iommu, &dma_dom->domain, devid);
1101 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
1102 dma_dom->domain.id);
1103 print_devid(devid, 1);
1104 }
1105}
1106
1107static struct dma_mapping_ops amd_iommu_dma_ops = {
1108 .alloc_coherent = alloc_coherent,
1109 .free_coherent = free_coherent,
1110 .map_single = map_single,
1111 .unmap_single = unmap_single,
1112 .map_sg = map_sg,
1113 .unmap_sg = unmap_sg,
1114};
1115
1116/*
1117 * The function which clues the AMD IOMMU driver into dma_ops.
1118 */
1119int __init amd_iommu_init_dma_ops(void)
1120{
1121 struct amd_iommu *iommu;
1122 int order = amd_iommu_aperture_order;
1123 int ret;
1124
1125 /*
1126 * first allocate a default protection domain for every IOMMU we
1127 * found in the system. Devices not assigned to any other
1128 * protection domain will be assigned to the default one.
1129 */
1130 list_for_each_entry(iommu, &amd_iommu_list, list) {
1131 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
1132 if (iommu->default_dom == NULL)
1133 return -ENOMEM;
1134 ret = iommu_init_unity_mappings(iommu);
1135 if (ret)
1136 goto free_domains;
1137 }
1138
1139 /*
1140 * If device isolation is enabled, pre-allocate the protection
1141 * domains for each device.
1142 */
1143 if (amd_iommu_isolate)
1144 prealloc_protection_domains();
1145
1146 iommu_detected = 1;
1147 force_iommu = 1;
1148 bad_dma_address = 0;
1149#ifdef CONFIG_GART_IOMMU
1150 gart_iommu_aperture_disabled = 1;
1151 gart_iommu_aperture = 0;
1152#endif
1153
1154 /* Make the driver finally visible to the drivers */
1155 dma_ops = &amd_iommu_dma_ops;
1156
1157 return 0;
1158
1159free_domains:
1160
1161 list_for_each_entry(iommu, &amd_iommu_list, list) {
1162 if (iommu->default_dom)
1163 dma_ops_domain_free(iommu->default_dom);
1164 }
1165
1166 return ret;
1167}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 000000000000..c9d8ff2eb130
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,1060 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/gfp.h>
23#include <linux/list.h>
24#include <linux/sysdev.h>
25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h>
28#include <asm/iommu.h>
29
30/*
31 * definitions for the ACPI scanning code
32 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48
35
36#define ACPI_IVHD_TYPE 0x10
37#define ACPI_IVMD_TYPE_ALL 0x20
38#define ACPI_IVMD_TYPE 0x21
39#define ACPI_IVMD_TYPE_RANGE 0x22
40
41#define IVHD_DEV_ALL 0x01
42#define IVHD_DEV_SELECT 0x02
43#define IVHD_DEV_SELECT_RANGE_START 0x03
44#define IVHD_DEV_RANGE_END 0x04
45#define IVHD_DEV_ALIAS 0x42
46#define IVHD_DEV_ALIAS_RANGE 0x43
47#define IVHD_DEV_EXT_SELECT 0x46
48#define IVHD_DEV_EXT_SELECT_RANGE 0x47
49
50#define IVHD_FLAG_HT_TUN_EN 0x00
51#define IVHD_FLAG_PASSPW_EN 0x01
52#define IVHD_FLAG_RESPASSPW_EN 0x02
53#define IVHD_FLAG_ISOC_EN 0x03
54
55#define IVMD_FLAG_EXCL_RANGE 0x08
56#define IVMD_FLAG_UNITY_MAP 0x01
57
58#define ACPI_DEVFLAG_INITPASS 0x01
59#define ACPI_DEVFLAG_EXTINT 0x02
60#define ACPI_DEVFLAG_NMI 0x04
61#define ACPI_DEVFLAG_SYSMGT1 0x10
62#define ACPI_DEVFLAG_SYSMGT2 0x20
63#define ACPI_DEVFLAG_LINT0 0x40
64#define ACPI_DEVFLAG_LINT1 0x80
65#define ACPI_DEVFLAG_ATSDIS 0x10000000
66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
78struct ivhd_header {
79 u8 type;
80 u8 flags;
81 u16 length;
82 u16 devid;
83 u16 cap_ptr;
84 u64 mmio_phys;
85 u16 pci_seg;
86 u16 info;
87 u32 reserved;
88} __attribute__((packed));
89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
94struct ivhd_entry {
95 u8 type;
96 u16 devid;
97 u8 flags;
98 u32 ext;
99} __attribute__((packed));
100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
105struct ivmd_header {
106 u8 type;
107 u8 flags;
108 u16 length;
109 u16 devid;
110 u16 aux;
111 u64 resv;
112 u64 range_start;
113 u64 range_length;
114} __attribute__((packed));
115
116static int __initdata amd_iommu_detected;
117
118u16 amd_iommu_last_bdf; /* largest PCI device id we have
119 to handle */
120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
127
128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
159unsigned long *amd_iommu_pd_alloc_bitmap;
160
161static u32 dev_table_size; /* size of the device table */
162static u32 alias_table_size; /* size of the alias table */
163static u32 rlookup_table_size; /* size if the rlookup table */
164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
193{
194 u64 start = iommu->exclusion_start & PAGE_MASK;
195 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
196 u64 entry;
197
198 if (!iommu->exclusion_start)
199 return;
200
201 entry = start | MMIO_EXCL_ENABLE_MASK;
202 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
203 &entry, sizeof(entry));
204
205 entry = limit;
206 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
207 &entry, sizeof(entry));
208}
209
210/* Programs the physical address of the device table into the IOMMU hardware */
211static void __init iommu_set_device_table(struct amd_iommu *iommu)
212{
213 u32 entry;
214
215 BUG_ON(iommu->mmio_base == NULL);
216
217 entry = virt_to_phys(amd_iommu_dev_table);
218 entry |= (dev_table_size >> 12) - 1;
219 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
220 &entry, sizeof(entry));
221}
222
223/* Generic functions to enable/disable certain features of the IOMMU. */
224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
225{
226 u32 ctrl;
227
228 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
229 ctrl |= (1 << bit);
230 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
231}
232
233static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{
235 u32 ctrl;
236
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240}
241
242/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu)
244{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
246 print_devid(iommu->devid, 0);
247 printk(" cap 0x%hx\n", iommu->cap_ptr);
248
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250}
251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
256static u8 * __init iommu_map_mmio_space(u64 address)
257{
258 u8 *ret;
259
260 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
261 return NULL;
262
263 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
264 if (ret != NULL)
265 return ret;
266
267 release_mem_region(address, MMIO_REGION_LENGTH);
268
269 return NULL;
270}
271
272static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
273{
274 if (iommu->mmio_base)
275 iounmap(iommu->mmio_base);
276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
277}
278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
293{
294 u32 cap;
295
296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
298
299 return 0;
300}
301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
307{
308 u8 *p = (void *)h, *end = (void *)h;
309 struct ivhd_entry *dev;
310
311 p += sizeof(*h);
312 end += h->length;
313
314 find_last_devid_on_pci(PCI_BUS(h->devid),
315 PCI_SLOT(h->devid),
316 PCI_FUNC(h->devid),
317 h->cap_ptr);
318
319 while (p < end) {
320 dev = (struct ivhd_entry *)p;
321 switch (dev->type) {
322 case IVHD_DEV_SELECT:
323 case IVHD_DEV_RANGE_END:
324 case IVHD_DEV_ALIAS:
325 case IVHD_DEV_EXT_SELECT:
326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
328 break;
329 default:
330 break;
331 }
332 p += 0x04 << (*p >> 6);
333 }
334
335 WARN_ON(p != end);
336
337 return 0;
338}
339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
345static int __init find_last_devid_acpi(struct acpi_table_header *table)
346{
347 int i;
348 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
349 struct ivhd_header *h;
350
351 /*
352 * Validate checksum here so we don't need to do it when
353 * we actually parse the table
354 */
355 for (i = 0; i < table->length; ++i)
356 checksum += p[i];
357 if (checksum != 0)
358 /* ACPI table corrupt */
359 return -ENODEV;
360
361 p += IVRS_HEADER_LENGTH;
362
363 end += table->length;
364 while (p < end) {
365 h = (struct ivhd_header *)p;
366 switch (h->type) {
367 case ACPI_IVHD_TYPE:
368 find_last_devid_from_ivhd(h);
369 break;
370 default:
371 break;
372 }
373 p += h->length;
374 }
375 WARN_ON(p != end);
376
377 return 0;
378}
379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
395{
396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
397 get_order(CMD_BUFFER_SIZE));
398 u64 entry;
399
400 if (cmd_buf == NULL)
401 return NULL;
402
403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
404
405 entry = (u64)virt_to_phys(cmd_buf);
406 entry |= MMIO_CMD_SIZE_512;
407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
408 &entry, sizeof(entry));
409
410 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
411
412 return cmd_buf;
413}
414
415static void __init free_command_buffer(struct amd_iommu *iommu)
416{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
418}
419
420/* sets a specific bit in the device table entry. */
421static void set_dev_entry_bit(u16 devid, u8 bit)
422{
423 int i = (bit >> 5) & 0x07;
424 int _bit = bit & 0x1f;
425
426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
427}
428
429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
441{
442 if (flags & ACPI_DEVFLAG_INITPASS)
443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
444 if (flags & ACPI_DEVFLAG_EXTINT)
445 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
446 if (flags & ACPI_DEVFLAG_NMI)
447 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
448 if (flags & ACPI_DEVFLAG_SYSMGT1)
449 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
450 if (flags & ACPI_DEVFLAG_SYSMGT2)
451 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
452 if (flags & ACPI_DEVFLAG_LINT0)
453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
454 if (flags & ACPI_DEVFLAG_LINT1)
455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
456
457 set_iommu_for_device(iommu, devid);
458}
459
460/*
461 * Reads the device exclusion range from ACPI and initialize IOMMU with
462 * it
463 */
464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
465{
466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
467
468 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
469 return;
470
471 if (iommu) {
472 /*
473 * We only can configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device. This is done here
476 */
477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
478 iommu->exclusion_start = m->range_start;
479 iommu->exclusion_length = m->range_length;
480 }
481}
482
483/*
484 * This function reads some important data from the IOMMU PCI space and
485 * initializes the driver data structure with it. It reads the hardware
486 * capabilities and the first/last device entries
487 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr;
494 u32 range;
495
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
497
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
503}
504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
510 struct ivhd_header *h)
511{
512 u8 *p = (u8 *)h;
513 u8 *end = p, flags = 0;
514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
515 u32 ext_flags = 0;
516 bool alias = false;
517 struct ivhd_entry *e;
518
519 /*
520 * First set the recommended feature enable bits from ACPI
521 * into the IOMMU control registers
522 */
523 h->flags & IVHD_FLAG_HT_TUN_EN ?
524 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
525 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
526
527 h->flags & IVHD_FLAG_PASSPW_EN ?
528 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
529 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
530
531 h->flags & IVHD_FLAG_RESPASSPW_EN ?
532 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
533 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
534
535 h->flags & IVHD_FLAG_ISOC_EN ?
536 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
537 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
538
539 /*
540 * make IOMMU memory accesses cache coherent
541 */
542 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
543
544 /*
545 * Done. Now parse the device entries
546 */
547 p += sizeof(struct ivhd_header);
548 end += h->length;
549
550 while (p < end) {
551 e = (struct ivhd_entry *)p;
552 switch (e->type) {
553 case IVHD_DEV_ALL:
554 for (dev_i = iommu->first_device;
555 dev_i <= iommu->last_device; ++dev_i)
556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
558 break;
559 case IVHD_DEV_SELECT:
560 devid = e->devid;
561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
562 break;
563 case IVHD_DEV_SELECT_RANGE_START:
564 devid_start = e->devid;
565 flags = e->flags;
566 ext_flags = 0;
567 alias = false;
568 break;
569 case IVHD_DEV_ALIAS:
570 devid = e->devid;
571 devid_to = e->ext >> 8;
572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
573 amd_iommu_alias_table[devid] = devid_to;
574 break;
575 case IVHD_DEV_ALIAS_RANGE:
576 devid_start = e->devid;
577 flags = e->flags;
578 devid_to = e->ext >> 8;
579 ext_flags = 0;
580 alias = true;
581 break;
582 case IVHD_DEV_EXT_SELECT:
583 devid = e->devid;
584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
586 break;
587 case IVHD_DEV_EXT_SELECT_RANGE:
588 devid_start = e->devid;
589 flags = e->flags;
590 ext_flags = e->ext;
591 alias = false;
592 break;
593 case IVHD_DEV_RANGE_END:
594 devid = e->devid;
595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
596 if (alias)
597 amd_iommu_alias_table[dev_i] = devid_to;
598 set_dev_entry_from_acpi(iommu,
599 amd_iommu_alias_table[dev_i],
600 flags, ext_flags);
601 }
602 break;
603 default:
604 break;
605 }
606
607 p += 0x04 << (e->type >> 6);
608 }
609}
610
611/* Initializes the device->iommu mapping for the driver */
612static int __init init_iommu_devices(struct amd_iommu *iommu)
613{
614 u16 i;
615
616 for (i = iommu->first_device; i <= iommu->last_device; ++i)
617 set_iommu_for_device(iommu, i);
618
619 return 0;
620}
621
622static void __init free_iommu_one(struct amd_iommu *iommu)
623{
624 free_command_buffer(iommu);
625 iommu_unmap_mmio_space(iommu);
626}
627
628static void __init free_iommu_all(void)
629{
630 struct amd_iommu *iommu, *next;
631
632 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
633 list_del(&iommu->list);
634 free_iommu_one(iommu);
635 kfree(iommu);
636 }
637}
638
639/*
640 * This function clues the initialization function for one IOMMU
641 * together and also allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU. This is done afterwards.
643 */
644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
645{
646 spin_lock_init(&iommu->lock);
647 list_add_tail(&iommu->list, &amd_iommu_list);
648
649 /*
650 * Copy data from ACPI table entry to the iommu struct
651 */
652 iommu->devid = h->devid;
653 iommu->cap_ptr = h->cap_ptr;
654 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base)
657 return -ENOMEM;
658
659 iommu_set_device_table(iommu);
660 iommu->cmd_buf = alloc_command_buffer(iommu);
661 if (!iommu->cmd_buf)
662 return -ENOMEM;
663
664 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu);
667
668 return 0;
669}
670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
675static int __init init_iommu_all(struct acpi_table_header *table)
676{
677 u8 *p = (u8 *)table, *end = (u8 *)table;
678 struct ivhd_header *h;
679 struct amd_iommu *iommu;
680 int ret;
681
682 end += table->length;
683 p += IVRS_HEADER_LENGTH;
684
685 while (p < end) {
686 h = (struct ivhd_header *)p;
687 switch (*p) {
688 case ACPI_IVHD_TYPE:
689 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
690 if (iommu == NULL)
691 return -ENOMEM;
692 ret = init_iommu_one(iommu, h);
693 if (ret)
694 return ret;
695 break;
696 default:
697 break;
698 }
699 p += h->length;
700
701 }
702 WARN_ON(p != end);
703
704 return 0;
705}
706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges).
712 *
713 ****************************************************************************/
714
715static void __init free_unity_maps(void)
716{
717 struct unity_map_entry *entry, *next;
718
719 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
720 list_del(&entry->list);
721 kfree(entry);
722 }
723}
724
725/* called when we find an exclusion range definition in ACPI */
726static int __init init_exclusion_range(struct ivmd_header *m)
727{
728 int i;
729
730 switch (m->type) {
731 case ACPI_IVMD_TYPE:
732 set_device_exclusion_range(m->devid, m);
733 break;
734 case ACPI_IVMD_TYPE_ALL:
735 for (i = 0; i < amd_iommu_last_bdf; ++i)
736 set_device_exclusion_range(i, m);
737 break;
738 case ACPI_IVMD_TYPE_RANGE:
739 for (i = m->devid; i <= m->aux; ++i)
740 set_device_exclusion_range(i, m);
741 break;
742 default:
743 break;
744 }
745
746 return 0;
747}
748
749/* called for unity map ACPI definition */
750static int __init init_unity_map_range(struct ivmd_header *m)
751{
752 struct unity_map_entry *e = 0;
753
754 e = kzalloc(sizeof(*e), GFP_KERNEL);
755 if (e == NULL)
756 return -ENOMEM;
757
758 switch (m->type) {
759 default:
760 case ACPI_IVMD_TYPE:
761 e->devid_start = e->devid_end = m->devid;
762 break;
763 case ACPI_IVMD_TYPE_ALL:
764 e->devid_start = 0;
765 e->devid_end = amd_iommu_last_bdf;
766 break;
767 case ACPI_IVMD_TYPE_RANGE:
768 e->devid_start = m->devid;
769 e->devid_end = m->aux;
770 break;
771 }
772 e->address_start = PAGE_ALIGN(m->range_start);
773 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
774 e->prot = m->flags >> 1;
775
776 list_add_tail(&e->list, &amd_iommu_unity_map);
777
778 return 0;
779}
780
781/* iterates over all memory definitions we find in the ACPI table */
782static int __init init_memory_definitions(struct acpi_table_header *table)
783{
784 u8 *p = (u8 *)table, *end = (u8 *)table;
785 struct ivmd_header *m;
786
787 end += table->length;
788 p += IVRS_HEADER_LENGTH;
789
790 while (p < end) {
791 m = (struct ivmd_header *)p;
792 if (m->flags & IVMD_FLAG_EXCL_RANGE)
793 init_exclusion_range(m);
794 else if (m->flags & IVMD_FLAG_UNITY_MAP)
795 init_unity_map_range(m);
796
797 p += m->length;
798 }
799
800 return 0;
801}
802
803/*
804 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized
806 */
807static void __init enable_iommus(void)
808{
809 struct amd_iommu *iommu;
810
811 list_for_each_entry(iommu, &amd_iommu_list, list) {
812 iommu_set_exclusion_range(iommu);
813 iommu_enable(iommu);
814 }
815}
816
817/*
818 * Suspend/Resume support
819 * disable suspend until real resume implemented
820 */
821
822static int amd_iommu_resume(struct sys_device *dev)
823{
824 return 0;
825}
826
827static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
828{
829 return -EINVAL;
830}
831
832static struct sysdev_class amd_iommu_sysdev_class = {
833 .name = "amd_iommu",
834 .suspend = amd_iommu_suspend,
835 .resume = amd_iommu_resume,
836};
837
838static struct sys_device device_amd_iommu = {
839 .id = 0,
840 .cls = &amd_iommu_sysdev_class,
841};
842
843/*
844 * This is the core init function for AMD IOMMU hardware in the system.
845 * This function is called from the generic x86 DMA layer initialization
846 * code.
847 *
848 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
849 * three times:
850 *
851 * 1 pass) Find the highest PCI device id the driver has to handle.
852 * Upon this information the size of the data structures is
853 * determined that needs to be allocated.
854 *
855 * 2 pass) Initialize the data structures just allocated with the
856 * information in the ACPI table about available AMD IOMMUs
857 * in the system. It also maps the PCI devices in the
858 * system to specific IOMMUs
859 *
860 * 3 pass) After the basic data structures are allocated and
861 * initialized we update them with information about memory
862 * remapping requirements parsed out of the ACPI table in
863 * this last pass.
864 *
865 * After that the hardware is initialized and ready to go. In the last
866 * step we do some Linux specific things like registering the driver in
867 * the dma_ops interface and initializing the suspend/resume support
868 * functions. Finally it prints some information about AMD IOMMUs and
869 * the driver state and enables the hardware.
870 */
871int __init amd_iommu_init(void)
872{
873 int i, ret = 0;
874
875
876 if (no_iommu) {
877 printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
878 return 0;
879 }
880
881 if (!amd_iommu_detected)
882 return -ENODEV;
883
884 /*
885 * First parse ACPI tables to find the largest Bus/Dev/Func
886 * we need to handle. Upon this information the shared data
887 * structures for the IOMMUs in the system will be allocated
888 */
889 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
890 return -ENODEV;
891
892 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
893 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
894 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
895
896 ret = -ENOMEM;
897
898 /* Device table - directly used by all IOMMUs */
899 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
900 get_order(dev_table_size));
901 if (amd_iommu_dev_table == NULL)
902 goto out;
903
904 /*
905 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
906 * IOMMU see for that device
907 */
908 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
909 get_order(alias_table_size));
910 if (amd_iommu_alias_table == NULL)
911 goto free;
912
913 /* IOMMU rlookup table - find the IOMMU for a specific device */
914 amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
915 get_order(rlookup_table_size));
916 if (amd_iommu_rlookup_table == NULL)
917 goto free;
918
919 /*
920 * Protection Domain table - maps devices to protection domains
921 * This table has the same size as the rlookup_table
922 */
923 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
924 get_order(rlookup_table_size));
925 if (amd_iommu_pd_table == NULL)
926 goto free;
927
928 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
929 GFP_KERNEL | __GFP_ZERO,
930 get_order(MAX_DOMAIN_ID/8));
931 if (amd_iommu_pd_alloc_bitmap == NULL)
932 goto free;
933
934 /*
935 * let all alias entries point to itself
936 */
937 for (i = 0; i < amd_iommu_last_bdf; ++i)
938 amd_iommu_alias_table[i] = i;
939
940 /*
941 * never allocate domain 0 because its used as the non-allocated and
942 * error value placeholder
943 */
944 amd_iommu_pd_alloc_bitmap[0] = 1;
945
946 /*
947 * now the data structures are allocated and basically initialized
948 * start the real acpi table scan
949 */
950 ret = -ENODEV;
951 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
952 goto free;
953
954 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
955 goto free;
956
957 ret = amd_iommu_init_dma_ops();
958 if (ret)
959 goto free;
960
961 ret = sysdev_class_register(&amd_iommu_sysdev_class);
962 if (ret)
963 goto free;
964
965 ret = sysdev_register(&device_amd_iommu);
966 if (ret)
967 goto free;
968
969 enable_iommus();
970
971 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
972 (1 << (amd_iommu_aperture_order-20)));
973
974 printk(KERN_INFO "AMD IOMMU: device isolation ");
975 if (amd_iommu_isolate)
976 printk("enabled\n");
977 else
978 printk("disabled\n");
979
980out:
981 return ret;
982
983free:
984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
985
986 free_pages((unsigned long)amd_iommu_pd_table,
987 get_order(rlookup_table_size));
988
989 free_pages((unsigned long)amd_iommu_rlookup_table,
990 get_order(rlookup_table_size));
991
992 free_pages((unsigned long)amd_iommu_alias_table,
993 get_order(alias_table_size));
994
995 free_pages((unsigned long)amd_iommu_dev_table,
996 get_order(dev_table_size));
997
998 free_iommu_all();
999
1000 free_unity_maps();
1001
1002 goto out;
1003}
1004
1005/****************************************************************************
1006 *
1007 * Early detect code. This code runs at IOMMU detection time in the DMA
1008 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1009 * IOMMUs
1010 *
1011 ****************************************************************************/
1012static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1013{
1014 return 0;
1015}
1016
1017void __init amd_iommu_detect(void)
1018{
1019 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
1020 return;
1021
1022 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1023 iommu_detected = 1;
1024 amd_iommu_detected = 1;
1025#ifdef CONFIG_GART_IOMMU
1026 gart_iommu_aperture_disabled = 1;
1027 gart_iommu_aperture = 0;
1028#endif
1029 }
1030}
1031
1032/****************************************************************************
1033 *
1034 * Parsing functions for the AMD IOMMU specific kernel command line
1035 * options.
1036 *
1037 ****************************************************************************/
1038
1039static int __init parse_amd_iommu_options(char *str)
1040{
1041 for (; *str; ++str) {
1042 if (strcmp(str, "isolate") == 0)
1043 amd_iommu_isolate = 1;
1044 }
1045
1046 return 1;
1047}
1048
1049static int __init parse_amd_iommu_size_options(char *str)
1050{
1051 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1052
1053 if ((order > 24) && (order < 31))
1054 amd_iommu_aperture_order = order;
1055
1056 return 1;
1057}
1058
1059__setup("amd_iommu=", parse_amd_iommu_options);
1060__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..44e21826db11 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
@@ -35,6 +36,18 @@ int fallback_aper_force __initdata;
35 36
36int fix_aperture __initdata = 1; 37int fix_aperture __initdata = 1;
37 38
39struct bus_dev_range {
40 int bus;
41 int dev_base;
42 int dev_limit;
43};
44
45static struct bus_dev_range bus_dev_ranges[] __initdata = {
46 { 0x00, 0x18, 0x20},
47 { 0xff, 0x00, 0x20},
48 { 0xfe, 0x00, 0x20}
49};
50
38static struct resource gart_resource = { 51static struct resource gart_resource = {
39 .name = "GART", 52 .name = "GART",
40 .flags = IORESOURCE_MEM, 53 .flags = IORESOURCE_MEM,
@@ -55,8 +68,9 @@ static u32 __init allocate_aperture(void)
55 u32 aper_size; 68 u32 aper_size;
56 void *p; 69 void *p;
57 70
58 if (fallback_aper_order > 7) 71 /* aper_size should <= 1G */
59 fallback_aper_order = 7; 72 if (fallback_aper_order > 5)
73 fallback_aper_order = 5;
60 aper_size = (32 * 1024 * 1024) << fallback_aper_order; 74 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
61 75
62 /* 76 /*
@@ -65,7 +79,20 @@ static u32 __init allocate_aperture(void)
65 * memory. Unfortunately we cannot move it up because that would 79 * memory. Unfortunately we cannot move it up because that would
66 * make the IOMMU useless. 80 * make the IOMMU useless.
67 */ 81 */
68 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); 82 /*
83 * using 512M as goal, in case kexec will load kernel_big
84 * that will do the on position decompress, and could overlap with
85 * that positon with gart that is used.
86 * sequende:
87 * kernel_small
88 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
89 * ==> kernel_small(gart area become e820_reserved)
90 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
91 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
92 * so don't use 512M below as gart iommu, leave the space for kernel
93 * code for safe
94 */
95 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
69 if (!p || __pa(p)+aper_size > 0xffffffff) { 96 if (!p || __pa(p)+aper_size > 0xffffffff) {
70 printk(KERN_ERR 97 printk(KERN_ERR
71 "Cannot allocate aperture memory hole (%p,%uK)\n", 98 "Cannot allocate aperture memory hole (%p,%uK)\n",
@@ -83,69 +110,53 @@ static u32 __init allocate_aperture(void)
83 return (u32)__pa(p); 110 return (u32)__pa(p);
84} 111}
85 112
86static int __init aperture_valid(u64 aper_base, u32 aper_size)
87{
88 if (!aper_base)
89 return 0;
90
91 if (aper_base + aper_size > 0x100000000UL) {
92 printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
93 return 0;
94 }
95 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
96 printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
97 return 0;
98 }
99 if (aper_size < 64*1024*1024) {
100 printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
101 return 0;
102 }
103
104 return 1;
105}
106 113
107/* Find a PCI capability */ 114/* Find a PCI capability */
108static __u32 __init find_cap(int num, int slot, int func, int cap) 115static u32 __init find_cap(int bus, int slot, int func, int cap)
109{ 116{
110 int bytes; 117 int bytes;
111 u8 pos; 118 u8 pos;
112 119
113 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & 120 if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
114 PCI_STATUS_CAP_LIST)) 121 PCI_STATUS_CAP_LIST))
115 return 0; 122 return 0;
116 123
117 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); 124 pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
118 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 125 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
119 u8 id; 126 u8 id;
120 127
121 pos &= ~3; 128 pos &= ~3;
122 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); 129 id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
123 if (id == 0xff) 130 if (id == 0xff)
124 break; 131 break;
125 if (id == cap) 132 if (id == cap)
126 return pos; 133 return pos;
127 pos = read_pci_config_byte(num, slot, func, 134 pos = read_pci_config_byte(bus, slot, func,
128 pos+PCI_CAP_LIST_NEXT); 135 pos+PCI_CAP_LIST_NEXT);
129 } 136 }
130 return 0; 137 return 0;
131} 138}
132 139
133/* Read a standard AGPv3 bridge header */ 140/* Read a standard AGPv3 bridge header */
134static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) 141static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
135{ 142{
136 u32 apsize; 143 u32 apsize;
137 u32 apsizereg; 144 u32 apsizereg;
138 int nbits; 145 int nbits;
139 u32 aper_low, aper_hi; 146 u32 aper_low, aper_hi;
140 u64 aper; 147 u64 aper;
148 u32 old_order;
141 149
142 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); 150 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
143 apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); 151 apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
144 if (apsizereg == 0xffffffff) { 152 if (apsizereg == 0xffffffff) {
145 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); 153 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
146 return 0; 154 return 0;
147 } 155 }
148 156
157 /* old_order could be the value from NB gart setting */
158 old_order = *order;
159
149 apsize = apsizereg & 0xfff; 160 apsize = apsizereg & 0xfff;
150 /* Some BIOS use weird encodings not in the AGPv3 table. */ 161 /* Some BIOS use weird encodings not in the AGPv3 table. */
151 if (apsize & 0xff) 162 if (apsize & 0xff)
@@ -155,14 +166,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
155 if ((int)*order < 0) /* < 32MB */ 166 if ((int)*order < 0) /* < 32MB */
156 *order = 0; 167 *order = 0;
157 168
158 aper_low = read_pci_config(num, slot, func, 0x10); 169 aper_low = read_pci_config(bus, slot, func, 0x10);
159 aper_hi = read_pci_config(num, slot, func, 0x14); 170 aper_hi = read_pci_config(bus, slot, func, 0x14);
160 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); 171 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
161 172
173 /*
174 * On some sick chips, APSIZE is 0. It means it wants 4G
175 * so let double check that order, and lets trust AMD NB settings:
176 */
177 printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
178 aper, 32 << old_order);
179 if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
180 printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
181 32 << *order, apsizereg);
182 *order = old_order;
183 }
184
162 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 185 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
163 aper, 32 << *order, apsizereg); 186 aper, 32 << *order, apsizereg);
164 187
165 if (!aperture_valid(aper, (32*1024*1024) << *order)) 188 if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
166 return 0; 189 return 0;
167 return (u32)aper; 190 return (u32)aper;
168} 191}
@@ -180,17 +203,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
180 * the AGP bridges should be always an own bus on the HT hierarchy, 203 * the AGP bridges should be always an own bus on the HT hierarchy,
181 * but do it here for future safety. 204 * but do it here for future safety.
182 */ 205 */
183static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) 206static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
184{ 207{
185 int num, slot, func; 208 int bus, slot, func;
186 209
187 /* Poor man's PCI discovery */ 210 /* Poor man's PCI discovery */
188 for (num = 0; num < 256; num++) { 211 for (bus = 0; bus < 256; bus++) {
189 for (slot = 0; slot < 32; slot++) { 212 for (slot = 0; slot < 32; slot++) {
190 for (func = 0; func < 8; func++) { 213 for (func = 0; func < 8; func++) {
191 u32 class, cap; 214 u32 class, cap;
192 u8 type; 215 u8 type;
193 class = read_pci_config(num, slot, func, 216 class = read_pci_config(bus, slot, func,
194 PCI_CLASS_REVISION); 217 PCI_CLASS_REVISION);
195 if (class == 0xffffffff) 218 if (class == 0xffffffff)
196 break; 219 break;
@@ -199,17 +222,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
199 case PCI_CLASS_BRIDGE_HOST: 222 case PCI_CLASS_BRIDGE_HOST:
200 case PCI_CLASS_BRIDGE_OTHER: /* needed? */ 223 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
201 /* AGP bridge? */ 224 /* AGP bridge? */
202 cap = find_cap(num, slot, func, 225 cap = find_cap(bus, slot, func,
203 PCI_CAP_ID_AGP); 226 PCI_CAP_ID_AGP);
204 if (!cap) 227 if (!cap)
205 break; 228 break;
206 *valid_agp = 1; 229 *valid_agp = 1;
207 return read_agp(num, slot, func, cap, 230 return read_agp(bus, slot, func, cap,
208 order); 231 order);
209 } 232 }
210 233
211 /* No multi-function device? */ 234 /* No multi-function device? */
212 type = read_pci_config_byte(num, slot, func, 235 type = read_pci_config_byte(bus, slot, func,
213 PCI_HEADER_TYPE); 236 PCI_HEADER_TYPE);
214 if (!(type & 0x80)) 237 if (!(type & 0x80))
215 break; 238 break;
@@ -249,36 +272,50 @@ void __init early_gart_iommu_check(void)
249 * or BIOS forget to put that in reserved. 272 * or BIOS forget to put that in reserved.
250 * try to update e820 to make that region as reserved. 273 * try to update e820 to make that region as reserved.
251 */ 274 */
252 int fix, num; 275 int i, fix, slot;
253 u32 ctl; 276 u32 ctl;
254 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 277 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
255 u64 aper_base = 0, last_aper_base = 0; 278 u64 aper_base = 0, last_aper_base = 0;
256 int aper_enabled = 0, last_aper_enabled = 0; 279 int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
257 280
258 if (!early_pci_allowed()) 281 if (!early_pci_allowed())
259 return; 282 return;
260 283
284 /* This is mostly duplicate of iommu_hole_init */
261 fix = 0; 285 fix = 0;
262 for (num = 24; num < 32; num++) { 286 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
263 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 287 int bus;
264 continue; 288 int dev_base, dev_limit;
265 289
266 ctl = read_pci_config(0, num, 3, 0x90); 290 bus = bus_dev_ranges[i].bus;
267 aper_enabled = ctl & 1; 291 dev_base = bus_dev_ranges[i].dev_base;
268 aper_order = (ctl >> 1) & 7; 292 dev_limit = bus_dev_ranges[i].dev_limit;
269 aper_size = (32 * 1024 * 1024) << aper_order; 293
270 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 294 for (slot = dev_base; slot < dev_limit; slot++) {
271 aper_base <<= 25; 295 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
272 296 continue;
273 if ((last_aper_order && aper_order != last_aper_order) || 297
274 (last_aper_base && aper_base != last_aper_base) || 298 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
275 (last_aper_enabled && aper_enabled != last_aper_enabled)) { 299 aper_enabled = ctl & AMD64_GARTEN;
276 fix = 1; 300 aper_order = (ctl >> 1) & 7;
277 break; 301 aper_size = (32 * 1024 * 1024) << aper_order;
302 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
303 aper_base <<= 25;
304
305 if (last_valid) {
306 if ((aper_order != last_aper_order) ||
307 (aper_base != last_aper_base) ||
308 (aper_enabled != last_aper_enabled)) {
309 fix = 1;
310 break;
311 }
312 }
313
314 last_aper_order = aper_order;
315 last_aper_base = aper_base;
316 last_aper_enabled = aper_enabled;
317 last_valid = 1;
278 } 318 }
279 last_aper_order = aper_order;
280 last_aper_base = aper_base;
281 last_aper_enabled = aper_enabled;
282 } 319 }
283 320
284 if (!fix && !aper_enabled) 321 if (!fix && !aper_enabled)
@@ -290,32 +327,46 @@ void __init early_gart_iommu_check(void)
290 if (gart_fix_e820 && !fix && aper_enabled) { 327 if (gart_fix_e820 && !fix && aper_enabled) {
291 if (e820_any_mapped(aper_base, aper_base + aper_size, 328 if (e820_any_mapped(aper_base, aper_base + aper_size,
292 E820_RAM)) { 329 E820_RAM)) {
293 /* reserved it, so we can resuse it in second kernel */ 330 /* reserve it, so we can reuse it in second kernel */
294 printk(KERN_INFO "update e820 for GART\n"); 331 printk(KERN_INFO "update e820 for GART\n");
295 add_memory_region(aper_base, aper_size, E820_RESERVED); 332 e820_add_region(aper_base, aper_size, E820_RESERVED);
296 update_e820(); 333 update_e820();
297 } 334 }
298 return;
299 } 335 }
300 336
337 if (!fix)
338 return;
339
301 /* different nodes have different setting, disable them all at first*/ 340 /* different nodes have different setting, disable them all at first*/
302 for (num = 24; num < 32; num++) { 341 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
303 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 342 int bus;
304 continue; 343 int dev_base, dev_limit;
344
345 bus = bus_dev_ranges[i].bus;
346 dev_base = bus_dev_ranges[i].dev_base;
347 dev_limit = bus_dev_ranges[i].dev_limit;
348
349 for (slot = dev_base; slot < dev_limit; slot++) {
350 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
351 continue;
305 352
306 ctl = read_pci_config(0, num, 3, 0x90); 353 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
307 ctl &= ~1; 354 ctl &= ~AMD64_GARTEN;
308 write_pci_config(0, num, 3, 0x90, ctl); 355 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
356 }
309 } 357 }
310 358
311} 359}
312 360
361static int __initdata printed_gart_size_msg;
362
313void __init gart_iommu_hole_init(void) 363void __init gart_iommu_hole_init(void)
314{ 364{
365 u32 agp_aper_base = 0, agp_aper_order = 0;
315 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 366 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
316 u64 aper_base, last_aper_base = 0; 367 u64 aper_base, last_aper_base = 0;
317 int fix, num, valid_agp = 0; 368 int fix, slot, valid_agp = 0;
318 int node; 369 int i, node;
319 370
320 if (gart_iommu_aperture_disabled || !fix_aperture || 371 if (gart_iommu_aperture_disabled || !fix_aperture ||
321 !early_pci_allowed()) 372 !early_pci_allowed())
@@ -323,38 +374,65 @@ void __init gart_iommu_hole_init(void)
323 374
324 printk(KERN_INFO "Checking aperture...\n"); 375 printk(KERN_INFO "Checking aperture...\n");
325 376
377 if (!fallback_aper_force)
378 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
379
326 fix = 0; 380 fix = 0;
327 node = 0; 381 node = 0;
328 for (num = 24; num < 32; num++) { 382 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
329 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 383 int bus;
330 continue; 384 int dev_base, dev_limit;
331 385
332 iommu_detected = 1; 386 bus = bus_dev_ranges[i].bus;
333 gart_iommu_aperture = 1; 387 dev_base = bus_dev_ranges[i].dev_base;
334 388 dev_limit = bus_dev_ranges[i].dev_limit;
335 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 389
336 aper_size = (32 * 1024 * 1024) << aper_order; 390 for (slot = dev_base; slot < dev_limit; slot++) {
337 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 391 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
338 aper_base <<= 25; 392 continue;
339 393
340 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", 394 iommu_detected = 1;
341 node, aper_base, aper_size >> 20); 395 gart_iommu_aperture = 1;
342 node++; 396
343 397 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
344 if (!aperture_valid(aper_base, aper_size)) { 398 aper_size = (32 * 1024 * 1024) << aper_order;
345 fix = 1; 399 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
346 break; 400 aper_base <<= 25;
347 } 401
402 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
403 node, aper_base, aper_size >> 20);
404 node++;
405
406 if (!aperture_valid(aper_base, aper_size, 64<<20)) {
407 if (valid_agp && agp_aper_base &&
408 agp_aper_base == aper_base &&
409 agp_aper_order == aper_order) {
410 /* the same between two setting from NB and agp */
411 if (!no_iommu &&
412 max_pfn > MAX_DMA32_PFN &&
413 !printed_gart_size_msg) {
414 printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
415 printk(KERN_ERR "please increase GART size in your BIOS setup\n");
416 printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
417 printed_gart_size_msg = 1;
418 }
419 } else {
420 fix = 1;
421 goto out;
422 }
423 }
348 424
349 if ((last_aper_order && aper_order != last_aper_order) || 425 if ((last_aper_order && aper_order != last_aper_order) ||
350 (last_aper_base && aper_base != last_aper_base)) { 426 (last_aper_base && aper_base != last_aper_base)) {
351 fix = 1; 427 fix = 1;
352 break; 428 goto out;
429 }
430 last_aper_order = aper_order;
431 last_aper_base = aper_base;
353 } 432 }
354 last_aper_order = aper_order;
355 last_aper_base = aper_base;
356 } 433 }
357 434
435out:
358 if (!fix && !fallback_aper_force) { 436 if (!fix && !fallback_aper_force) {
359 if (last_aper_base) { 437 if (last_aper_base) {
360 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 438 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -364,14 +442,16 @@ void __init gart_iommu_hole_init(void)
364 return; 442 return;
365 } 443 }
366 444
367 if (!fallback_aper_force) 445 if (!fallback_aper_force) {
368 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 446 aper_alloc = agp_aper_base;
447 aper_order = agp_aper_order;
448 }
369 449
370 if (aper_alloc) { 450 if (aper_alloc) {
371 /* Got the aperture from the AGP bridge */ 451 /* Got the aperture from the AGP bridge */
372 } else if (swiotlb && !valid_agp) { 452 } else if (swiotlb && !valid_agp) {
373 /* Do nothing */ 453 /* Do nothing */
374 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || 454 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
375 force_iommu || 455 force_iommu ||
376 valid_agp || 456 valid_agp ||
377 fallback_aper_force) { 457 fallback_aper_force) {
@@ -401,16 +481,24 @@ void __init gart_iommu_hole_init(void)
401 } 481 }
402 482
403 /* Fix up the north bridges */ 483 /* Fix up the north bridges */
404 for (num = 24; num < 32; num++) { 484 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
405 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 485 int bus;
406 continue; 486 int dev_base, dev_limit;
407 487
408 /* 488 bus = bus_dev_ranges[i].bus;
409 * Don't enable translation yet. That is done later. 489 dev_base = bus_dev_ranges[i].dev_base;
410 * Assume this BIOS didn't initialise the GART so 490 dev_limit = bus_dev_ranges[i].dev_limit;
411 * just overwrite all previous bits 491 for (slot = dev_base; slot < dev_limit; slot++) {
412 */ 492 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
413 write_pci_config(0, num, 3, 0x90, aper_order<<1); 493 continue;
414 write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 494
495 /* Don't enable translation yet. That is done later.
496 Assume this BIOS didn't initialise the GART so
497 just overwrite all previous bits */
498 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
499 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
500 }
415 } 501 }
502
503 set_up_gart_resume(aper_order, aper_alloc);
416} 504}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..d6c898358371 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,29 +52,40 @@
52 52
53unsigned long mp_lapic_addr; 53unsigned long mp_lapic_addr;
54 54
55DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
56EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
57
58/* 55/*
59 * Knob to control our willingness to enable the local APIC. 56 * Knob to control our willingness to enable the local APIC.
60 * 57 *
61 * -1=force-disable, +1=force-enable 58 * +1=force-enable
62 */ 59 */
63static int enable_local_apic __initdata; 60static int force_enable_local_apic;
61int disable_apic;
64 62
65/* Local APIC timer verification ok */ 63/* Local APIC timer verification ok */
66static int local_apic_timer_verify_ok; 64static int local_apic_timer_verify_ok;
67/* Disable local APIC timer from the kernel commandline or via dmi quirk 65/* Disable local APIC timer from the kernel commandline or via dmi quirk */
68 or using CPU MSR check */ 66static int local_apic_timer_disabled;
69int local_apic_timer_disabled;
70/* Local APIC timer works in C2 */ 67/* Local APIC timer works in C2 */
71int local_apic_timer_c2_ok; 68int local_apic_timer_c2_ok;
72EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
73 70
71int first_system_vector = 0xfe;
72
73char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
74
74/* 75/*
75 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
76 */ 77 */
77int apic_verbosity; 78unsigned int apic_verbosity;
79
80int pic_mode;
81
82/* Have we found an MP table */
83int smp_found_config;
84
85static struct resource lapic_resource = {
86 .name = "Local APIC",
87 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
88};
78 89
79static unsigned int calibration_result; 90static unsigned int calibration_result;
80 91
@@ -166,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
166 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
167 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
168 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
169 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
170} 181}
171 182
172/** 183/**
@@ -201,9 +212,6 @@ int lapic_get_maxlvt(void)
201 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
202 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
203 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
204 *
205 * We do reads before writes even if unnecessary, to get around the
206 * P5 APIC double write bug.
207 */ 215 */
208static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
209{ 217{
@@ -218,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
218 if (!irqen) 226 if (!irqen)
219 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
220 228
221 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
222 230
223 /* 231 /*
224 * Divide PICLK by 16 232 * Divide PICLK by 16
225 */ 233 */
226 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
227 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
228 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
229 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
230 238
231 if (!oneshot) 239 if (!oneshot)
232 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
233} 241}
234 242
235/* 243/*
@@ -238,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
238static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
239 struct clock_event_device *evt) 247 struct clock_event_device *evt)
240{ 248{
241 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
242 return 0; 250 return 0;
243} 251}
244 252
@@ -267,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
267 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
268 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
269 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
270 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
271 break; 279 break;
272 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
273 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -361,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
361 } 369 }
362} 370}
363 371
364/* 372static int __init calibrate_APIC_clock(void)
365 * Setup the boot APIC
366 *
367 * Calibrate and verify the result.
368 */
369void __init setup_boot_APIC_clock(void)
370{ 373{
371 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
372 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -376,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
376 long delta, deltapm; 379 long delta, deltapm;
377 int pm_referenced = 0; 380 int pm_referenced = 0;
378 381
379 /*
380 * The local apic timer can be disabled via the kernel
381 * commandline or from the CPU detection code. Register the lapic
382 * timer as a dummy clock event source on SMP systems, so the
383 * broadcast mechanism is used. On UP systems simply ignore it.
384 */
385 if (local_apic_timer_disabled) {
386 /* No broadcast on UP ! */
387 if (num_possible_cpus() > 1) {
388 lapic_clockevent.mult = 1;
389 setup_APIC_timer();
390 }
391 return;
392 }
393
394 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
395 "calibrating APIC timer ...\n");
396
397 local_irq_disable(); 382 local_irq_disable();
398 383
399 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -478,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
478 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
479 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
480 465
481 local_apic_timer_verify_ok = 1;
482
483 /* 466 /*
484 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
485 */ 468 */
@@ -487,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
487 local_irq_enable(); 470 local_irq_enable();
488 printk(KERN_WARNING 471 printk(KERN_WARNING
489 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
490 /* No broadcast on UP ! */ 473 return -1;
491 if (num_possible_cpus() > 1)
492 setup_APIC_timer();
493 return;
494 } 474 }
495 475
476 local_apic_timer_verify_ok = 1;
477
496 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
497 if (!pm_referenced) { 479 if (!pm_referenced) {
498 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -532,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
532 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
533 printk(KERN_WARNING 515 printk(KERN_WARNING
534 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
535 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
536 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
537 return; 539 lapic_clockevent.mult = 1;
538 } else { 540 setup_APIC_timer();
539 /* 541 }
540 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
541 * PIT/HPET going. Otherwise register lapic as a dummy 543 }
542 * device. 544
543 */ 545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
544 if (nmi_watchdog != NMI_IO_APIC) 546 "calibrating APIC timer ...\n");
545 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 547
546 else 548 if (calibrate_APIC_clock()) {
547 printk(KERN_WARNING "APIC timer registered as dummy," 549 /* No broadcast on UP ! */
548 " due to nmi_watchdog=1!\n"); 550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
549 } 553 }
550 554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
551 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
552 setup_APIC_timer(); 567 setup_APIC_timer();
553} 568}
@@ -682,44 +697,44 @@ void clear_local_APIC(void)
682 */ 697 */
683 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
684 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
685 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
686 } 701 }
687 /* 702 /*
688 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
689 * any level-triggered sources. 704 * any level-triggered sources.
690 */ 705 */
691 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
692 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
693 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
694 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
695 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
696 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
697 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
698 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
699 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
700 } 715 }
701 716
702 /* lets not touch this if we didn't frob it */ 717 /* lets not touch this if we didn't frob it */
703#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
704 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
705 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
706 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
707 } 722 }
708#endif 723#endif
709 /* 724 /*
710 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
711 */ 726 */
712 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
713 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
714 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
715 if (maxlvt >= 3) 730 if (maxlvt >= 3)
716 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
717 if (maxlvt >= 4) 732 if (maxlvt >= 4)
718 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
719 734
720#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
721 if (maxlvt >= 5) 736 if (maxlvt >= 5)
722 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
723#endif 738#endif
724 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
725 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -745,7 +760,7 @@ void disable_local_APIC(void)
745 */ 760 */
746 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
747 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
748 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
749 764
750 /* 765 /*
751 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -854,8 +869,8 @@ void __init sync_Arb_IDs(void)
854 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
855 870
856 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
857 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
858 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
859} 874}
860 875
861/* 876/*
@@ -891,16 +906,16 @@ void __init init_bsp_APIC(void)
891 else 906 else
892 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
893 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
894 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
895 910
896 /* 911 /*
897 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
898 */ 913 */
899 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
900 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
901 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
902 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
903 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
904} 919}
905 920
906static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -915,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
915 930
916 /* enables sending errors */ 931 /* enables sending errors */
917 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
918 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
919 /* 934 /*
920 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
921 */ 936 */
@@ -963,7 +978,7 @@ void __cpuinit setup_local_APIC(void)
963 * Double-check whether this APIC is really registered. 978 * Double-check whether this APIC is really registered.
964 */ 979 */
965 if (!apic_id_registered()) 980 if (!apic_id_registered())
966 BUG(); 981 WARN_ON_ONCE(1);
967 982
968 /* 983 /*
969 * Intel recommends to set DFR, LDR and TPR before enabling 984 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -978,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
978 */ 993 */
979 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
980 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
981 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
982 997
983 /* 998 /*
984 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1036,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1036 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1037 */ 1052 */
1038 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1039 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1040 1055
1041 /* 1056 /*
1042 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1058,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1058 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1059 smp_processor_id()); 1074 smp_processor_id());
1060 } 1075 }
1061 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1062 1077
1063 /* 1078 /*
1064 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1069,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1069 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1070 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1071 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1072 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1073} 1088}
1074 1089
1075void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1080,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1080 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1081 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1082 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1083 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1084 1099
1085 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1086 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1094,7 +1109,7 @@ static int __init detect_init_APIC(void)
1094 u32 h, l, features; 1109 u32 h, l, features;
1095 1110
1096 /* Disabled by kernel option? */ 1111 /* Disabled by kernel option? */
1097 if (enable_local_apic < 0) 1112 if (disable_apic)
1098 return -1; 1113 return -1;
1099 1114
1100 switch (boot_cpu_data.x86_vendor) { 1115 switch (boot_cpu_data.x86_vendor) {
@@ -1117,7 +1132,7 @@ static int __init detect_init_APIC(void)
1117 * Over-ride BIOS and try to enable the local APIC only if 1132 * Over-ride BIOS and try to enable the local APIC only if
1118 * "lapic" specified. 1133 * "lapic" specified.
1119 */ 1134 */
1120 if (enable_local_apic <= 0) { 1135 if (!force_enable_local_apic) {
1121 printk(KERN_INFO "Local APIC disabled by BIOS -- " 1136 printk(KERN_INFO "Local APIC disabled by BIOS -- "
1122 "you can enable it with \"lapic\"\n"); 1137 "you can enable it with \"lapic\"\n");
1123 return -1; 1138 return -1;
@@ -1154,9 +1169,6 @@ static int __init detect_init_APIC(void)
1154 if (l & MSR_IA32_APICBASE_ENABLE) 1169 if (l & MSR_IA32_APICBASE_ENABLE)
1155 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; 1170 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1156 1171
1157 if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
1158 nmi_watchdog = NMI_LOCAL_APIC;
1159
1160 printk(KERN_INFO "Found and enabled local APIC!\n"); 1172 printk(KERN_INFO "Found and enabled local APIC!\n");
1161 1173
1162 apic_pm_activate(); 1174 apic_pm_activate();
@@ -1195,36 +1207,6 @@ void __init init_apic_mappings(void)
1195 if (boot_cpu_physical_apicid == -1U) 1207 if (boot_cpu_physical_apicid == -1U)
1196 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1208 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
1197 1209
1198#ifdef CONFIG_X86_IO_APIC
1199 {
1200 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
1201 int i;
1202
1203 for (i = 0; i < nr_ioapics; i++) {
1204 if (smp_found_config) {
1205 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
1206 if (!ioapic_phys) {
1207 printk(KERN_ERR
1208 "WARNING: bogus zero IO-APIC "
1209 "address found in MPTABLE, "
1210 "disabling IO/APIC support!\n");
1211 smp_found_config = 0;
1212 skip_ioapic_setup = 1;
1213 goto fake_ioapic_page;
1214 }
1215 } else {
1216fake_ioapic_page:
1217 ioapic_phys = (unsigned long)
1218 alloc_bootmem_pages(PAGE_SIZE);
1219 ioapic_phys = __pa(ioapic_phys);
1220 }
1221 set_fixmap_nocache(idx, ioapic_phys);
1222 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
1223 __fix_to_virt(idx), ioapic_phys);
1224 idx++;
1225 }
1226 }
1227#endif
1228} 1210}
1229 1211
1230/* 1212/*
@@ -1236,9 +1218,6 @@ int apic_version[MAX_APICS];
1236 1218
1237int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1238{ 1220{
1239 if (enable_local_apic < 0)
1240 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1241
1242 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1243 return -1; 1222 return -1;
1244 1223
@@ -1265,10 +1244,14 @@ int __init APIC_init_uniprocessor(void)
1265#ifdef CONFIG_CRASH_DUMP 1244#ifdef CONFIG_CRASH_DUMP
1266 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1245 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
1267#endif 1246#endif
1268 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 1247 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1269 1248
1270 setup_local_APIC(); 1249 setup_local_APIC();
1271 1250
1251#ifdef CONFIG_X86_IO_APIC
1252 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1253#endif
1254 localise_nmi_watchdog();
1272 end_local_APIC_setup(); 1255 end_local_APIC_setup();
1273#ifdef CONFIG_X86_IO_APIC 1256#ifdef CONFIG_X86_IO_APIC
1274 if (smp_found_config) 1257 if (smp_found_config)
@@ -1351,13 +1334,17 @@ void __init smp_intr_init(void)
1351 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 1334 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1352 * IPI, driven by wakeup. 1335 * IPI, driven by wakeup.
1353 */ 1336 */
1354 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 1337 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1355 1338
1356 /* IPI for invalidation */ 1339 /* IPI for invalidation */
1357 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 1340 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1358 1341
1359 /* IPI for generic function call */ 1342 /* IPI for generic function call */
1360 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 1343 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1344
1345 /* IPI for single call function */
1346 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1347 call_function_single_interrupt);
1361} 1348}
1362#endif 1349#endif
1363 1350
@@ -1370,15 +1357,15 @@ void __init apic_intr_init(void)
1370 smp_intr_init(); 1357 smp_intr_init();
1371#endif 1358#endif
1372 /* self generated IPI for local APIC timer */ 1359 /* self generated IPI for local APIC timer */
1373 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 1360 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1374 1361
1375 /* IPI vectors for APIC spurious and error interrupts */ 1362 /* IPI vectors for APIC spurious and error interrupts */
1376 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 1363 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1377 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 1364 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1378 1365
1379 /* thermal monitor LVT interrupt */ 1366 /* thermal monitor LVT interrupt */
1380#ifdef CONFIG_X86_MCE_P4THERMAL 1367#ifdef CONFIG_X86_MCE_P4THERMAL
1381 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 1368 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1382#endif 1369#endif
1383} 1370}
1384 1371
@@ -1433,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1433 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1434 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1435 value |= 0xf; 1422 value |= 0xf;
1436 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1437 1424
1438 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1439 /* 1426 /*
@@ -1446,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1446 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1447 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1448 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1449 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1450 } else { 1437 } else {
1451 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1452 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1453 } 1440 }
1454 1441
1455 /* 1442 /*
@@ -1463,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1463 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1464 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1465 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1466 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1467 } 1454 }
1468} 1455}
1469 1456
@@ -1513,6 +1500,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1513 */ 1500 */
1514 cpu = 0; 1501 cpu = 0;
1515 1502
1503 if (apicid > max_physical_apicid)
1504 max_physical_apicid = apicid;
1505
1516 /* 1506 /*
1517 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1507 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1518 * but we need to work other dependencies like SMP_SUSPEND etc 1508 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1520,7 +1510,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1520 * if (CPU_HOTPLUG_ENABLED || num_processors > 8) 1510 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1521 * - Ashok Raj <ashok.raj@intel.com> 1511 * - Ashok Raj <ashok.raj@intel.com>
1522 */ 1512 */
1523 if (num_processors > 8) { 1513 if (max_physical_apicid >= 8) {
1524 switch (boot_cpu_data.x86_vendor) { 1514 switch (boot_cpu_data.x86_vendor) {
1525 case X86_VENDOR_INTEL: 1515 case X86_VENDOR_INTEL:
1526 if (!APIC_XAPIC(version)) { 1516 if (!APIC_XAPIC(version)) {
@@ -1534,9 +1524,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1534 } 1524 }
1535#ifdef CONFIG_SMP 1525#ifdef CONFIG_SMP
1536 /* are we being called early in kernel startup? */ 1526 /* are we being called early in kernel startup? */
1537 if (x86_cpu_to_apicid_early_ptr) { 1527 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1538 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1528 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1539 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1529 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1540 1530
1541 cpu_to_apicid[cpu] = apicid; 1531 cpu_to_apicid[cpu] = apicid;
1542 bios_cpu_apicid[cpu] = apicid; 1532 bios_cpu_apicid[cpu] = apicid;
@@ -1703,15 +1693,15 @@ static void apic_pm_activate(void) { }
1703 */ 1693 */
1704static int __init parse_lapic(char *arg) 1694static int __init parse_lapic(char *arg)
1705{ 1695{
1706 enable_local_apic = 1; 1696 force_enable_local_apic = 1;
1707 return 0; 1697 return 0;
1708} 1698}
1709early_param("lapic", parse_lapic); 1699early_param("lapic", parse_lapic);
1710 1700
1711static int __init parse_nolapic(char *arg) 1701static int __init parse_nolapic(char *arg)
1712{ 1702{
1713 enable_local_apic = -1; 1703 disable_apic = 1;
1714 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1704 setup_clear_cpu_cap(X86_FEATURE_APIC);
1715 return 0; 1705 return 0;
1716} 1706}
1717early_param("nolapic", parse_nolapic); 1707early_param("nolapic", parse_nolapic);
@@ -1740,3 +1730,21 @@ static int __init apic_set_verbosity(char *str)
1740} 1730}
1741__setup("apic=", apic_set_verbosity); 1731__setup("apic=", apic_set_verbosity);
1742 1732
1733static int __init lapic_insert_resource(void)
1734{
1735 if (!apic_phys)
1736 return -1;
1737
1738 /* Put local APIC into the resource map. */
1739 lapic_resource.start = apic_phys;
1740 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1741 insert_resource(&iomem_resource, &lapic_resource);
1742
1743 return 0;
1744}
1745
1746/*
1747 * need call insert after e820_reserve_resources()
1748 * that is using request_resource
1749 */
1750late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..7f1f030da7ee 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -43,7 +43,7 @@
43#include <mach_ipi.h> 43#include <mach_ipi.h>
44#include <mach_apic.h> 44#include <mach_apic.h>
45 45
46int disable_apic_timer __cpuinitdata; 46static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata; 47static int apic_calibrate_pmtmr __initdata;
48int disable_apic; 48int disable_apic;
49 49
@@ -54,7 +54,10 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58
59/* Have we found an MP table */
60int smp_found_config;
58 61
59static struct resource lapic_resource = { 62static struct resource lapic_resource = {
60 .name = "Local APIC", 63 .name = "Local APIC",
@@ -87,9 +90,6 @@ static unsigned long apic_phys;
87 90
88unsigned long mp_lapic_addr; 91unsigned long mp_lapic_addr;
89 92
90DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
91EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
92
93unsigned int __cpuinitdata maxcpus = NR_CPUS; 93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/* 94/*
95 * Get the LAPIC version 95 * Get the LAPIC version
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
314 314
315#define TICK_COUNT 100000000 315#define TICK_COUNT 100000000
316 316
317static void __init calibrate_APIC_clock(void) 317static int __init calibrate_APIC_clock(void)
318{ 318{
319 unsigned apic, apic_start; 319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 320 unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 368 clockevent_delta2ns(0xF, &lapic_clockevent);
369 369
370 calibration_result = result / HZ; 370 calibration_result = result / HZ;
371
372 /*
373 * Do a sanity check on the APIC calibration result
374 */
375 if (calibration_result < (1000000 / HZ)) {
376 printk(KERN_WARNING
377 "APIC frequency too slow, disabling apic timer\n");
378 return -1;
379 }
380
381 return 0;
371} 382}
372 383
373/* 384/*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
394 } 405 }
395 406
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 407 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 408 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 409 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 410 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 411 setup_APIC_timer();
@@ -417,37 +421,13 @@ void __init setup_boot_APIC_clock(void)
417 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 421 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
418 else 422 else
419 printk(KERN_WARNING "APIC timer registered as dummy," 423 printk(KERN_WARNING "APIC timer registered as dummy,"
420 " due to nmi_watchdog=1!\n"); 424 " due to nmi_watchdog=%d!\n", nmi_watchdog);
421 425
422 setup_APIC_timer(); 426 setup_APIC_timer();
423} 427}
424 428
425/*
426 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
427 * C1E flag only in the secondary CPU, so when we detect the wreckage
428 * we already have enabled the boot CPU local apic timer. Check, if
429 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
430 * set the DUMMY flag again and force the broadcast mode in the
431 * clockevents layer.
432 */
433static void __cpuinit check_boot_apic_timer_broadcast(void)
434{
435 if (!disable_apic_timer ||
436 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
437 return;
438
439 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
440 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
441
442 local_irq_enable();
443 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
444 &boot_cpu_physical_apicid);
445 local_irq_disable();
446}
447
448void __cpuinit setup_secondary_APIC_clock(void) 429void __cpuinit setup_secondary_APIC_clock(void)
449{ 430{
450 check_boot_apic_timer_broadcast();
451 setup_APIC_timer(); 431 setup_APIC_timer();
452} 432}
453 433
@@ -850,7 +830,6 @@ static void __cpuinit lapic_setup_esr(void)
850void __cpuinit end_local_APIC_setup(void) 830void __cpuinit end_local_APIC_setup(void)
851{ 831{
852 lapic_setup_esr(); 832 lapic_setup_esr();
853 nmi_watchdog_default();
854 setup_apic_nmi_watchdog(NULL); 833 setup_apic_nmi_watchdog(NULL);
855 apic_pm_activate(); 834 apic_pm_activate();
856} 835}
@@ -875,7 +854,7 @@ static int __init detect_init_APIC(void)
875 854
876void __init early_init_lapic_mapping(void) 855void __init early_init_lapic_mapping(void)
877{ 856{
878 unsigned long apic_phys; 857 unsigned long phys_addr;
879 858
880 /* 859 /*
881 * If no local APIC can be found then go out 860 * If no local APIC can be found then go out
@@ -884,11 +863,11 @@ void __init early_init_lapic_mapping(void)
884 if (!smp_found_config) 863 if (!smp_found_config)
885 return; 864 return;
886 865
887 apic_phys = mp_lapic_addr; 866 phys_addr = mp_lapic_addr;
888 867
889 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 868 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
890 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", 869 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
891 APIC_BASE, apic_phys); 870 APIC_BASE, phys_addr);
892 871
893 /* 872 /*
894 * Fetch the APIC ID of the BSP in case we have a 873 * Fetch the APIC ID of the BSP in case we have a
@@ -942,7 +921,9 @@ int __init APIC_init_uniprocessor(void)
942 921
943 verify_local_APIC(); 922 verify_local_APIC();
944 923
945 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 924 connect_bsp_APIC();
925
926 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
946 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); 927 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
947 928
948 setup_local_APIC(); 929 setup_local_APIC();
@@ -954,6 +935,8 @@ int __init APIC_init_uniprocessor(void)
954 if (!skip_ioapic_setup && nr_ioapics) 935 if (!skip_ioapic_setup && nr_ioapics)
955 enable_IO_APIC(); 936 enable_IO_APIC();
956 937
938 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
939 localise_nmi_watchdog();
957 end_local_APIC_setup(); 940 end_local_APIC_setup();
958 941
959 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 942 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1021,6 +1004,14 @@ asmlinkage void smp_error_interrupt(void)
1021 irq_exit(); 1004 irq_exit();
1022} 1005}
1023 1006
1007/**
1008 * * connect_bsp_APIC - attach the APIC to the interrupt system
1009 * */
1010void __init connect_bsp_APIC(void)
1011{
1012 enable_apic_mode();
1013}
1014
1024void disconnect_bsp_APIC(int virt_wire_setup) 1015void disconnect_bsp_APIC(int virt_wire_setup)
1025{ 1016{
1026 /* Go back to Virtual Wire compatibility mode */ 1017 /* Go back to Virtual Wire compatibility mode */
@@ -1090,10 +1081,13 @@ void __cpuinit generic_processor_info(int apicid, int version)
1090 */ 1081 */
1091 cpu = 0; 1082 cpu = 0;
1092 } 1083 }
1084 if (apicid > max_physical_apicid)
1085 max_physical_apicid = apicid;
1086
1093 /* are we being called early in kernel startup? */ 1087 /* are we being called early in kernel startup? */
1094 if (x86_cpu_to_apicid_early_ptr) { 1088 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1095 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1089 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1096 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1090 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1097 1091
1098 cpu_to_apicid[cpu] = apicid; 1092 cpu_to_apicid[cpu] = apicid;
1099 bios_cpu_apicid[cpu] = apicid; 1093 bios_cpu_apicid[cpu] = apicid;
@@ -1269,7 +1263,7 @@ __cpuinit int apic_is_clustered_box(void)
1269 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) 1263 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1270 return 0; 1264 return 0;
1271 1265
1272 bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1266 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1273 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1267 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1274 1268
1275 for (i = 0; i < NR_CPUS; i++) { 1269 for (i = 0; i < NR_CPUS; i++) {
@@ -1347,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
1347static __init int setup_disableapic(char *str) 1341static __init int setup_disableapic(char *str)
1348{ 1342{
1349 disable_apic = 1; 1343 disable_apic = 1;
1350 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1344 setup_clear_cpu_cap(X86_FEATURE_APIC);
1351 return 0; 1345 return 0;
1352} 1346}
1353early_param("disableapic", setup_disableapic); 1347early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..9ee24e6bc4b0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,6 +204,7 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
207#include <linux/types.h> 208#include <linux/types.h>
208#include <linux/stddef.h> 209#include <linux/stddef.h>
209#include <linux/timer.h> 210#include <linux/timer.h>
@@ -218,7 +219,6 @@
218#include <linux/time.h> 219#include <linux/time.h>
219#include <linux/sched.h> 220#include <linux/sched.h>
220#include <linux/pm.h> 221#include <linux/pm.h>
221#include <linux/pm_legacy.h>
222#include <linux/capability.h> 222#include <linux/capability.h>
223#include <linux/device.h> 223#include <linux/device.h>
224#include <linux/kernel.h> 224#include <linux/kernel.h>
@@ -228,6 +228,7 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
231 232
232#include <asm/system.h> 233#include <asm/system.h>
233#include <asm/uaccess.h> 234#include <asm/uaccess.h>
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1149 as->event_tail = 0; 1150 as->event_tail = 0;
1150 } 1151 }
1151 as->events[as->event_head] = event; 1152 as->events[as->event_head] = event;
1152 if ((!as->suser) || (!as->writer)) 1153 if (!as->suser || !as->writer)
1153 continue; 1154 continue;
1154 switch (event) { 1155 switch (event) {
1155 case APM_SYS_SUSPEND: 1156 case APM_SYS_SUSPEND:
@@ -1211,9 +1212,9 @@ static int suspend(int vetoable)
1211 if (err != APM_SUCCESS) 1212 if (err != APM_SUCCESS)
1212 apm_error("suspend", err); 1213 apm_error("suspend", err);
1213 err = (err == APM_SUCCESS) ? 0 : -EIO; 1214 err = (err == APM_SUCCESS) ? 0 : -EIO;
1214 device_power_up(); 1215 device_power_up(PMSG_RESUME);
1215 local_irq_enable(); 1216 local_irq_enable();
1216 device_resume(); 1217 device_resume(PMSG_RESUME);
1217 queue_event(APM_NORMAL_RESUME, NULL); 1218 queue_event(APM_NORMAL_RESUME, NULL);
1218 spin_lock(&user_list_lock); 1219 spin_lock(&user_list_lock);
1219 for (as = user_list; as != NULL; as = as->next) { 1220 for (as = user_list; as != NULL; as = as->next) {
@@ -1238,7 +1239,7 @@ static void standby(void)
1238 apm_error("standby", err); 1239 apm_error("standby", err);
1239 1240
1240 local_irq_disable(); 1241 local_irq_disable();
1241 device_power_up(); 1242 device_power_up(PMSG_RESUME);
1242 local_irq_enable(); 1243 local_irq_enable();
1243} 1244}
1244 1245
@@ -1324,7 +1325,7 @@ static void check_events(void)
1324 ignore_bounce = 1; 1325 ignore_bounce = 1;
1325 if ((event != APM_NORMAL_RESUME) 1326 if ((event != APM_NORMAL_RESUME)
1326 || (ignore_normal_resume == 0)) { 1327 || (ignore_normal_resume == 0)) {
1327 device_resume(); 1328 device_resume(PMSG_RESUME);
1328 queue_event(event, NULL); 1329 queue_event(event, NULL);
1329 } 1330 }
1330 ignore_normal_resume = 0; 1331 ignore_normal_resume = 0;
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void)
1396 1397
1397static int check_apm_user(struct apm_user *as, const char *func) 1398static int check_apm_user(struct apm_user *as, const char *func)
1398{ 1399{
1399 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { 1400 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1400 printk(KERN_ERR "apm: %s passed bad filp\n", func); 1401 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1401 return 1; 1402 return 1;
1402 } 1403 }
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
1459 return 0; 1460 return 0;
1460} 1461}
1461 1462
1462static int do_ioctl(struct inode *inode, struct file *filp, 1463static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1463 u_int cmd, u_long arg)
1464{ 1464{
1465 struct apm_user *as; 1465 struct apm_user *as;
1466 int ret;
1466 1467
1467 as = filp->private_data; 1468 as = filp->private_data;
1468 if (check_apm_user(as, "ioctl")) 1469 if (check_apm_user(as, "ioctl"))
1469 return -EIO; 1470 return -EIO;
1470 if ((!as->suser) || (!as->writer)) 1471 if (!as->suser || !as->writer)
1471 return -EPERM; 1472 return -EPERM;
1472 switch (cmd) { 1473 switch (cmd) {
1473 case APM_IOC_STANDBY: 1474 case APM_IOC_STANDBY:
1475 lock_kernel();
1474 if (as->standbys_read > 0) { 1476 if (as->standbys_read > 0) {
1475 as->standbys_read--; 1477 as->standbys_read--;
1476 as->standbys_pending--; 1478 as->standbys_pending--;
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
1479 queue_event(APM_USER_STANDBY, as); 1481 queue_event(APM_USER_STANDBY, as);
1480 if (standbys_pending <= 0) 1482 if (standbys_pending <= 0)
1481 standby(); 1483 standby();
1484 unlock_kernel();
1482 break; 1485 break;
1483 case APM_IOC_SUSPEND: 1486 case APM_IOC_SUSPEND:
1487 lock_kernel();
1484 if (as->suspends_read > 0) { 1488 if (as->suspends_read > 0) {
1485 as->suspends_read--; 1489 as->suspends_read--;
1486 as->suspends_pending--; 1490 as->suspends_pending--;
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
1488 } else 1492 } else
1489 queue_event(APM_USER_SUSPEND, as); 1493 queue_event(APM_USER_SUSPEND, as);
1490 if (suspends_pending <= 0) { 1494 if (suspends_pending <= 0) {
1491 return suspend(1); 1495 ret = suspend(1);
1492 } else { 1496 } else {
1493 as->suspend_wait = 1; 1497 as->suspend_wait = 1;
1494 wait_event_interruptible(apm_suspend_waitqueue, 1498 wait_event_interruptible(apm_suspend_waitqueue,
1495 as->suspend_wait == 0); 1499 as->suspend_wait == 0);
1496 return as->suspend_result; 1500 ret = as->suspend_result;
1497 } 1501 }
1498 break; 1502 unlock_kernel();
1503 return ret;
1499 default: 1504 default:
1500 return -EINVAL; 1505 return -ENOTTY;
1501 } 1506 }
1502 return 0; 1507 return 0;
1503} 1508}
@@ -1544,10 +1549,12 @@ static int do_open(struct inode *inode, struct file *filp)
1544{ 1549{
1545 struct apm_user *as; 1550 struct apm_user *as;
1546 1551
1552 lock_kernel();
1547 as = kmalloc(sizeof(*as), GFP_KERNEL); 1553 as = kmalloc(sizeof(*as), GFP_KERNEL);
1548 if (as == NULL) { 1554 if (as == NULL) {
1549 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1555 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1550 sizeof(*as)); 1556 sizeof(*as));
1557 unlock_kernel();
1551 return -ENOMEM; 1558 return -ENOMEM;
1552 } 1559 }
1553 as->magic = APM_BIOS_MAGIC; 1560 as->magic = APM_BIOS_MAGIC;
@@ -1569,6 +1576,7 @@ static int do_open(struct inode *inode, struct file *filp)
1569 user_list = as; 1576 user_list = as;
1570 spin_unlock(&user_list_lock); 1577 spin_unlock(&user_list_lock);
1571 filp->private_data = as; 1578 filp->private_data = as;
1579 unlock_kernel();
1572 return 0; 1580 return 0;
1573} 1581}
1574 1582
@@ -1860,7 +1868,7 @@ static const struct file_operations apm_bios_fops = {
1860 .owner = THIS_MODULE, 1868 .owner = THIS_MODULE,
1861 .read = do_read, 1869 .read = do_read,
1862 .poll = do_poll, 1870 .poll = do_poll,
1863 .ioctl = do_ioctl, 1871 .unlocked_ioctl = do_ioctl,
1864 .open = do_open, 1872 .open = do_open,
1865 .release = do_release, 1873 .release = do_release,
1866}; 1874};
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 92588083950f..6649d09ad88f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,7 +111,7 @@ void foo(void)
111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 111 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 112 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 113 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
114 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); 114 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); 115 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
116#endif 116#endif
117 117
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f126c05d6170..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -34,7 +36,7 @@ int main(void)
34 ENTRY(pid); 36 ENTRY(pid);
35 BLANK(); 37 BLANK();
36#undef ENTRY 38#undef ENTRY
37#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) 39#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
38 ENTRY(flags); 40 ENTRY(flags);
39 ENTRY(addr_limit); 41 ENTRY(addr_limit);
40 ENTRY(preempt_count); 42 ENTRY(preempt_count);
@@ -61,8 +63,11 @@ int main(void)
61 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); 63 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
62 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 64 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
63 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 65 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
66 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
64 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 67 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
65 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); 68 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
69 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
70 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
66 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 71 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
67 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 72 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
68#endif 73#endif
@@ -128,5 +133,14 @@ int main(void)
128 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
129 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
130 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
131 return 0; 145 return 0;
132} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..c639bd55391c
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f8190887..ee76eaad3001 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,11 +6,15 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o feature_names.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += common.o bugs.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o
9obj-$(CONFIG_X86_32) += amd.o 10obj-$(CONFIG_X86_32) += amd.o
11obj-$(CONFIG_X86_64) += amd_64.o
10obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_X86_32) += cyrix.o
11obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_X86_32) += centaur.o
14obj-$(CONFIG_X86_64) += centaur_64.o
12obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_X86_32) += transmeta.o
13obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_X86_32) += intel.o
17obj-$(CONFIG_X86_64) += intel_64.o
14obj-$(CONFIG_X86_32) += umc.o 18obj-$(CONFIG_X86_32) += umc.o
15 19
16obj-$(CONFIG_X86_MCE) += mcheck/ 20obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7cb..84a8220a6072 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,9 +1,7 @@
1
2/* 1/*
3 * Routines to indentify additional cpu features that are scattered in 2 * Routines to indentify additional cpu features that are scattered in
4 * cpuid space. 3 * cpuid space.
5 */ 4 */
6
7#include <linux/cpu.h> 5#include <linux/cpu.h>
8 6
9#include <asm/pat.h> 7#include <asm/pat.h>
@@ -53,19 +51,20 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
53#ifdef CONFIG_X86_PAT 51#ifdef CONFIG_X86_PAT
54void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
55{ 53{
54 if (!cpu_has_pat)
55 pat_disable("PAT not supported by CPU.");
56
56 switch (c->x86_vendor) { 57 switch (c->x86_vendor) {
57 case X86_VENDOR_AMD:
58 if (c->x86 >= 0xf && c->x86 <= 0x11)
59 return;
60 break;
61 case X86_VENDOR_INTEL: 58 case X86_VENDOR_INTEL:
62 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
63 return; 60 return;
64 break; 61 break;
62 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA:
65 return;
65 } 66 }
66 67
67 pat_disable(cpu_has_pat ? 68 pat_disable("PAT disabled. Not yet verified on this CPU type.");
68 "PAT disabled. Not yet verified on this CPU type." :
69 "PAT not supported by CPU.");
70} 69}
71#endif 70#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 245866828294..cae9cabc3031 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,45 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27#ifdef CONFIG_X86_LOCAL_APIC
28#define ENABLE_C1E_MASK 0x18000000
29#define CPUID_PROCESSOR_SIGNATURE 1
30#define CPUID_XFAM 0x0ff00000
31#define CPUID_XFAM_K8 0x00000000
32#define CPUID_XFAM_10H 0x00100000
33#define CPUID_XFAM_11H 0x00200000
34#define CPUID_XMOD 0x000f0000
35#define CPUID_XMOD_REV_F 0x00040000
36
37/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
38static __cpuinit int amd_apic_timer_broken(void)
39{
40 u32 lo, hi;
41 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
42 switch (eax & CPUID_XFAM) {
43 case CPUID_XFAM_K8:
44 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
45 break;
46 case CPUID_XFAM_10H:
47 case CPUID_XFAM_11H:
48 rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
49 if (lo & ENABLE_C1E_MASK) {
50 if (smp_processor_id() != boot_cpu_physical_apicid)
51 printk(KERN_INFO "AMD C1E detected late. "
52 " Force timer broadcast.\n");
53 return 1;
54 }
55 break;
56 default:
57 /* err on the side of caution */
58 return 1;
59 }
60 return 0;
61}
62#endif
63
64int force_mwait __cpuinitdata;
65
66static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
67{ 28{
68 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
@@ -297,11 +258,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
297 num_cache_leaves = 3; 258 num_cache_leaves = 3;
298 } 259 }
299 260
300#ifdef CONFIG_X86_LOCAL_APIC
301 if (amd_apic_timer_broken())
302 local_apic_timer_disabled = 1;
303#endif
304
305 /* K6s reports MCEs but don't actually have all the MSRs */ 261 /* K6s reports MCEs but don't actually have all the MSRs */
306 if (c->x86 < 6) 262 if (c->x86 < 6)
307 clear_cpu_cap(c, X86_FEATURE_MCE); 263 clear_cpu_cap(c, X86_FEATURE_MCE);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
new file mode 100644
index 000000000000..d1692b2a41ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -0,0 +1,224 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221};
222
223cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
224
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b2..c9b58a806e85 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,8 +59,12 @@ static void __init check_fpu(void)
59 return; 59 return;
60 } 60 }
61 61
62/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ 62 /*
63 /* Test for the divl bug.. */ 63 * trap_init() enabled FXSR and company _before_ testing for FP
64 * problems here.
65 *
66 * Test for the divl bug..
67 */
64 __asm__("fninit\n\t" 68 __asm__("fninit\n\t"
65 "fldl %1\n\t" 69 "fldl %1\n\t"
66 "fdivl %2\n\t" 70 "fdivl %2\n\t"
@@ -108,10 +112,15 @@ static void __init check_popad(void)
108 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " 112 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
109 : "=&a" (res) 113 : "=&a" (res)
110 : "d" (inp) 114 : "d" (inp)
111 : "ecx", "edi" ); 115 : "ecx", "edi");
112 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ 116 /*
113 if (res != 12345678) printk( "Buggy.\n" ); 117 * If this fails, it means that any user program may lock the
114 else printk( "OK.\n" ); 118 * CPU hard. Too bad.
119 */
120 if (res != 12345678)
121 printk("Buggy.\n");
122 else
123 printk("OK.\n");
115#endif 124#endif
116} 125}
117 126
@@ -122,13 +131,7 @@ static void __init check_popad(void)
122 * (for due to lack of "invlpg" and working WP on a i386) 131 * (for due to lack of "invlpg" and working WP on a i386)
123 * - In order to run on anything without a TSC, we need to be 132 * - In order to run on anything without a TSC, we need to be
124 * compiled for a i486. 133 * compiled for a i486.
125 * - In order to support the local APIC on a buggy Pentium machine, 134 */
126 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
127 * which happens implicitly if compiled for a Pentium or lower
128 * (unless an advanced selection of CPU features is used) as an
129 * otherwise config implies a properly working local APIC without
130 * the need to do extra reads from the APIC.
131*/
132 135
133static void __init check_config(void) 136static void __init check_config(void)
134{ 137{
@@ -137,25 +140,11 @@ static void __init check_config(void)
137 * i486+ only features! (WP works in supervisor mode and the 140 * i486+ only features! (WP works in supervisor mode and the
138 * new "invlpg" and "bswap" instructions) 141 * new "invlpg" and "bswap" instructions)
139 */ 142 */
140#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) 143#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
144 defined(CONFIG_X86_BSWAP)
141 if (boot_cpu_data.x86 == 3) 145 if (boot_cpu_data.x86 == 3)
142 panic("Kernel requires i486+ for 'invlpg' and other features"); 146 panic("Kernel requires i486+ for 'invlpg' and other features");
143#endif 147#endif
144
145/*
146 * If we were told we had a good local APIC, check for buggy Pentia,
147 * i.e. all B steppings and the C2 stepping of P54C when using their
148 * integrated APIC (see 11AP erratum in "Pentium Processor
149 * Specification Update").
150 */
151#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
152 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
153 && cpu_has_apic
154 && boot_cpu_data.x86 == 5
155 && boot_cpu_data.x86_model == 2
156 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
157 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
158#endif
159} 148}
160 149
161 150
@@ -170,6 +159,7 @@ void __init check_bugs(void)
170 check_fpu(); 159 check_fpu();
171 check_hlt(); 160 check_hlt();
172 check_popad(); 161 check_popad();
173 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); 162 init_utsname()->machine[1] =
163 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
174 alternative_instructions(); 164 alternative_instructions();
175} 165}
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..9a3ed0649d4e 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 000000000000..1d181c40e2e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,35 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
25}
26
27static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
28 .c_vendor = "Centaur",
29 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur,
31 .c_init = init_centaur,
32};
33
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
35
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d0463a946247..80ab20d4fa39 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -427,7 +427,7 @@ __setup("serialnumber", x86_serial_nr_setup);
427/* 427/*
428 * This does the hard work of actually picking apart the CPU stuff... 428 * This does the hard work of actually picking apart the CPU stuff...
429 */ 429 */
430void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 430static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
431{ 431{
432 int i; 432 int i;
433 433
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
new file mode 100644
index 000000000000..dd6e3f15017e
--- /dev/null
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -0,0 +1,670 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/numa.h>
22#ifdef CONFIG_X86_LOCAL_APIC
23#include <asm/mpspec.h>
24#include <asm/apic.h>
25#include <mach_apic.h>
26#endif
27#include <asm/pda.h>
28#include <asm/pgtable.h>
29#include <asm/processor.h>
30#include <asm/desc.h>
31#include <asm/atomic.h>
32#include <asm/proto.h>
33#include <asm/sections.h>
34#include <asm/setup.h>
35#include <asm/genapic.h>
36
37#include "cpu.h"
38
39/* We need valid kernel segments for data and code in long mode too
40 * IRET will check the segment types kkeil 2000/10/28
41 * Also sysret mandates a special GDT layout
42 */
43/* The TLS descriptors are currently at a different place compared to i386.
44 Hopefully nobody expects them at a fixed place (Wine?) */
45DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
46 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
47 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
48 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
49 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
50 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
51 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
52} };
53EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
54
55__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
56
57/* Current gdt points %fs at the "master" per-cpu area: after this,
58 * it's on the real one. */
59void switch_to_new_gdt(void)
60{
61 struct desc_ptr gdt_descr;
62
63 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
64 gdt_descr.size = GDT_SIZE - 1;
65 load_gdt(&gdt_descr);
66}
67
68struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
69
70static void __cpuinit default_init(struct cpuinfo_x86 *c)
71{
72 display_cacheinfo(c);
73}
74
75static struct cpu_dev __cpuinitdata default_cpu = {
76 .c_init = default_init,
77 .c_vendor = "Unknown",
78};
79static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
80
81int __cpuinit get_model_name(struct cpuinfo_x86 *c)
82{
83 unsigned int *v;
84
85 if (c->extended_cpuid_level < 0x80000004)
86 return 0;
87
88 v = (unsigned int *) c->x86_model_id;
89 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
90 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
91 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
92 c->x86_model_id[48] = 0;
93 return 1;
94}
95
96
97void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
98{
99 unsigned int n, dummy, ebx, ecx, edx;
100
101 n = c->extended_cpuid_level;
102
103 if (n >= 0x80000005) {
104 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
105 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
106 "D cache %dK (%d bytes/line)\n",
107 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
108 c->x86_cache_size = (ecx>>24) + (edx>>24);
109 /* On K8 L1 TLB is inclusive, so don't count it */
110 c->x86_tlbsize = 0;
111 }
112
113 if (n >= 0x80000006) {
114 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
115 ecx = cpuid_ecx(0x80000006);
116 c->x86_cache_size = ecx >> 16;
117 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
118
119 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
120 c->x86_cache_size, ecx & 0xFF);
121 }
122}
123
124void __cpuinit detect_ht(struct cpuinfo_x86 *c)
125{
126#ifdef CONFIG_SMP
127 u32 eax, ebx, ecx, edx;
128 int index_msb, core_bits;
129
130 cpuid(1, &eax, &ebx, &ecx, &edx);
131
132
133 if (!cpu_has(c, X86_FEATURE_HT))
134 return;
135 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
136 goto out;
137
138 smp_num_siblings = (ebx & 0xff0000) >> 16;
139
140 if (smp_num_siblings == 1) {
141 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
142 } else if (smp_num_siblings > 1) {
143
144 if (smp_num_siblings > NR_CPUS) {
145 printk(KERN_WARNING "CPU: Unsupported number of "
146 "siblings %d", smp_num_siblings);
147 smp_num_siblings = 1;
148 return;
149 }
150
151 index_msb = get_count_order(smp_num_siblings);
152 c->phys_proc_id = phys_pkg_id(index_msb);
153
154 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
155
156 index_msb = get_count_order(smp_num_siblings);
157
158 core_bits = get_count_order(c->x86_max_cores);
159
160 c->cpu_core_id = phys_pkg_id(index_msb) &
161 ((1 << core_bits) - 1);
162 }
163out:
164 if ((c->x86_max_cores * smp_num_siblings) > 1) {
165 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
166 c->phys_proc_id);
167 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
168 c->cpu_core_id);
169 }
170
171#endif
172}
173
174static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
175{
176 char *v = c->x86_vendor_id;
177 int i;
178 static int printed;
179
180 for (i = 0; i < X86_VENDOR_NUM; i++) {
181 if (cpu_devs[i]) {
182 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
183 (cpu_devs[i]->c_ident[1] &&
184 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
185 c->x86_vendor = i;
186 this_cpu = cpu_devs[i];
187 return;
188 }
189 }
190 }
191 if (!printed) {
192 printed++;
193 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
194 printk(KERN_ERR "CPU: Your system may be unstable.\n");
195 }
196 c->x86_vendor = X86_VENDOR_UNKNOWN;
197}
198
199static void __init early_cpu_support_print(void)
200{
201 int i,j;
202 struct cpu_dev *cpu_devx;
203
204 printk("KERNEL supported cpus:\n");
205 for (i = 0; i < X86_VENDOR_NUM; i++) {
206 cpu_devx = cpu_devs[i];
207 if (!cpu_devx)
208 continue;
209 for (j = 0; j < 2; j++) {
210 if (!cpu_devx->c_ident[j])
211 continue;
212 printk(" %s %s\n", cpu_devx->c_vendor,
213 cpu_devx->c_ident[j]);
214 }
215 }
216}
217
218static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
219
220void __init early_cpu_init(void)
221{
222 struct cpu_vendor_dev *cvdev;
223
224 for (cvdev = __x86cpuvendor_start ;
225 cvdev < __x86cpuvendor_end ;
226 cvdev++)
227 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
228 early_cpu_support_print();
229 early_identify_cpu(&boot_cpu_data);
230}
231
232/* Do some early cpuid on the boot CPU to get some parameter that are
233 needed before check_bugs. Everything advanced is in identify_cpu
234 below. */
235static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
236{
237 u32 tfms, xlvl;
238
239 c->loops_per_jiffy = loops_per_jiffy;
240 c->x86_cache_size = -1;
241 c->x86_vendor = X86_VENDOR_UNKNOWN;
242 c->x86_model = c->x86_mask = 0; /* So far unknown... */
243 c->x86_vendor_id[0] = '\0'; /* Unset */
244 c->x86_model_id[0] = '\0'; /* Unset */
245 c->x86_clflush_size = 64;
246 c->x86_cache_alignment = c->x86_clflush_size;
247 c->x86_max_cores = 1;
248 c->x86_coreid_bits = 0;
249 c->extended_cpuid_level = 0;
250 memset(&c->x86_capability, 0, sizeof c->x86_capability);
251
252 /* Get vendor name */
253 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
254 (unsigned int *)&c->x86_vendor_id[0],
255 (unsigned int *)&c->x86_vendor_id[8],
256 (unsigned int *)&c->x86_vendor_id[4]);
257
258 get_cpu_vendor(c);
259
260 /* Initialize the standard set of capabilities */
261 /* Note that the vendor-specific code below might override */
262
263 /* Intel-defined flags: level 0x00000001 */
264 if (c->cpuid_level >= 0x00000001) {
265 __u32 misc;
266 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
267 &c->x86_capability[0]);
268 c->x86 = (tfms >> 8) & 0xf;
269 c->x86_model = (tfms >> 4) & 0xf;
270 c->x86_mask = tfms & 0xf;
271 if (c->x86 == 0xf)
272 c->x86 += (tfms >> 20) & 0xff;
273 if (c->x86 >= 0x6)
274 c->x86_model += ((tfms >> 16) & 0xF) << 4;
275 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
276 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
277 } else {
278 /* Have CPUID level 0 only - unheard of */
279 c->x86 = 4;
280 }
281
282 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
283#ifdef CONFIG_SMP
284 c->phys_proc_id = c->initial_apicid;
285#endif
286 /* AMD-defined flags: level 0x80000001 */
287 xlvl = cpuid_eax(0x80000000);
288 c->extended_cpuid_level = xlvl;
289 if ((xlvl & 0xffff0000) == 0x80000000) {
290 if (xlvl >= 0x80000001) {
291 c->x86_capability[1] = cpuid_edx(0x80000001);
292 c->x86_capability[6] = cpuid_ecx(0x80000001);
293 }
294 if (xlvl >= 0x80000004)
295 get_model_name(c); /* Default name */
296 }
297
298 /* Transmeta-defined flags: level 0x80860001 */
299 xlvl = cpuid_eax(0x80860000);
300 if ((xlvl & 0xffff0000) == 0x80860000) {
301 /* Don't set x86_cpuid_level here for now to not confuse. */
302 if (xlvl >= 0x80860001)
303 c->x86_capability[2] = cpuid_edx(0x80860001);
304 }
305
306 if (c->extended_cpuid_level >= 0x80000007)
307 c->x86_power = cpuid_edx(0x80000007);
308
309 if (c->extended_cpuid_level >= 0x80000008) {
310 u32 eax = cpuid_eax(0x80000008);
311
312 c->x86_virt_bits = (eax >> 8) & 0xff;
313 c->x86_phys_bits = eax & 0xff;
314 }
315
316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
317 cpu_devs[c->x86_vendor]->c_early_init)
318 cpu_devs[c->x86_vendor]->c_early_init(c);
319
320 validate_pat_support(c);
321}
322
323/*
324 * This does the hard work of actually picking apart the CPU stuff...
325 */
326static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
327{
328 int i;
329
330 early_identify_cpu(c);
331
332 init_scattered_cpuid_features(c);
333
334 c->apicid = phys_pkg_id(0);
335
336 /*
337 * Vendor-specific initialization. In this section we
338 * canonicalize the feature flags, meaning if there are
339 * features a certain CPU supports which CPUID doesn't
340 * tell us, CPUID claiming incorrect flags, or other bugs,
341 * we handle them here.
342 *
343 * At the end of this section, c->x86_capability better
344 * indicate the features this CPU genuinely supports!
345 */
346 if (this_cpu->c_init)
347 this_cpu->c_init(c);
348
349 detect_ht(c);
350
351 /*
352 * On SMP, boot_cpu_data holds the common feature set between
353 * all CPUs; so make sure that we indicate which features are
354 * common between the CPUs. The first time this routine gets
355 * executed, c == &boot_cpu_data.
356 */
357 if (c != &boot_cpu_data) {
358 /* AND the already accumulated flags with these */
359 for (i = 0; i < NCAPINTS; i++)
360 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
361 }
362
363 /* Clear all flags overriden by options */
364 for (i = 0; i < NCAPINTS; i++)
365 c->x86_capability[i] &= ~cleared_cpu_caps[i];
366
367#ifdef CONFIG_X86_MCE
368 mcheck_init(c);
369#endif
370 select_idle_routine(c);
371
372#ifdef CONFIG_NUMA
373 numa_add_cpu(smp_processor_id());
374#endif
375
376}
377
378void __cpuinit identify_boot_cpu(void)
379{
380 identify_cpu(&boot_cpu_data);
381}
382
383void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
384{
385 BUG_ON(c == &boot_cpu_data);
386 identify_cpu(c);
387 mtrr_ap_init();
388}
389
390static __init int setup_noclflush(char *arg)
391{
392 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
393 return 1;
394}
395__setup("noclflush", setup_noclflush);
396
397void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
398{
399 if (c->x86_model_id[0])
400 printk(KERN_CONT "%s", c->x86_model_id);
401
402 if (c->x86_mask || c->cpuid_level >= 0)
403 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
404 else
405 printk(KERN_CONT "\n");
406}
407
408static __init int setup_disablecpuid(char *arg)
409{
410 int bit;
411 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
412 setup_clear_cpu_cap(bit);
413 else
414 return 0;
415 return 1;
416}
417__setup("clearcpuid=", setup_disablecpuid);
418
419cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
420
421struct x8664_pda **_cpu_pda __read_mostly;
422EXPORT_SYMBOL(_cpu_pda);
423
424struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
425
426char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
427
428unsigned long __supported_pte_mask __read_mostly = ~0UL;
429EXPORT_SYMBOL_GPL(__supported_pte_mask);
430
431static int do_not_nx __cpuinitdata;
432
433/* noexec=on|off
434Control non executable mappings for 64bit processes.
435
436on Enable(default)
437off Disable
438*/
439static int __init nonx_setup(char *str)
440{
441 if (!str)
442 return -EINVAL;
443 if (!strncmp(str, "on", 2)) {
444 __supported_pte_mask |= _PAGE_NX;
445 do_not_nx = 0;
446 } else if (!strncmp(str, "off", 3)) {
447 do_not_nx = 1;
448 __supported_pte_mask &= ~_PAGE_NX;
449 }
450 return 0;
451}
452early_param("noexec", nonx_setup);
453
454int force_personality32;
455
456/* noexec32=on|off
457Control non executable heap for 32bit processes.
458To control the stack too use noexec=off
459
460on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
461off PROT_READ implies PROT_EXEC
462*/
463static int __init nonx32_setup(char *str)
464{
465 if (!strcmp(str, "on"))
466 force_personality32 &= ~READ_IMPLIES_EXEC;
467 else if (!strcmp(str, "off"))
468 force_personality32 |= READ_IMPLIES_EXEC;
469 return 1;
470}
471__setup("noexec32=", nonx32_setup);
472
473void pda_init(int cpu)
474{
475 struct x8664_pda *pda = cpu_pda(cpu);
476
477 /* Setup up data that may be needed in __get_free_pages early */
478 loadsegment(fs, 0);
479 loadsegment(gs, 0);
480 /* Memory clobbers used to order PDA accessed */
481 mb();
482 wrmsrl(MSR_GS_BASE, pda);
483 mb();
484
485 pda->cpunumber = cpu;
486 pda->irqcount = -1;
487 pda->kernelstack = (unsigned long)stack_thread_info() -
488 PDA_STACKOFFSET + THREAD_SIZE;
489 pda->active_mm = &init_mm;
490 pda->mmu_state = 0;
491
492 if (cpu == 0) {
493 /* others are initialized in smpboot.c */
494 pda->pcurrent = &init_task;
495 pda->irqstackptr = boot_cpu_stack;
496 } else {
497 pda->irqstackptr = (char *)
498 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
499 if (!pda->irqstackptr)
500 panic("cannot allocate irqstack for cpu %d", cpu);
501
502 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
503 pda->nodenumber = cpu_to_node(cpu);
504 }
505
506 pda->irqstackptr += IRQSTACKSIZE-64;
507}
508
509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
510 DEBUG_STKSZ] __page_aligned_bss;
511
512extern asmlinkage void ignore_sysret(void);
513
514/* May not be marked __init: used by software suspend */
515void syscall_init(void)
516{
517 /*
518 * LSTAR and STAR live in a bit strange symbiosis.
519 * They both write to the same internal register. STAR allows to
520 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
521 */
522 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
523 wrmsrl(MSR_LSTAR, system_call);
524 wrmsrl(MSR_CSTAR, ignore_sysret);
525
526#ifdef CONFIG_IA32_EMULATION
527 syscall32_cpu_init();
528#endif
529
530 /* Flags to clear on syscall */
531 wrmsrl(MSR_SYSCALL_MASK,
532 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
533}
534
535void __cpuinit check_efer(void)
536{
537 unsigned long efer;
538
539 rdmsrl(MSR_EFER, efer);
540 if (!(efer & EFER_NX) || do_not_nx)
541 __supported_pte_mask &= ~_PAGE_NX;
542}
543
544unsigned long kernel_eflags;
545
546/*
547 * Copies of the original ist values from the tss are only accessed during
548 * debugging, no special alignment required.
549 */
550DEFINE_PER_CPU(struct orig_ist, orig_ist);
551
552/*
553 * cpu_init() initializes state that is per-CPU. Some data is already
554 * initialized (naturally) in the bootstrap process, such as the GDT
555 * and IDT. We reload them nevertheless, this function acts as a
556 * 'CPU state barrier', nothing should get across.
557 * A lot of state is already set up in PDA init.
558 */
559void __cpuinit cpu_init(void)
560{
561 int cpu = stack_smp_processor_id();
562 struct tss_struct *t = &per_cpu(init_tss, cpu);
563 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
564 unsigned long v;
565 char *estacks = NULL;
566 struct task_struct *me;
567 int i;
568
569 /* CPU 0 is initialised in head64.c */
570 if (cpu != 0)
571 pda_init(cpu);
572 else
573 estacks = boot_exception_stacks;
574
575 me = current;
576
577 if (cpu_test_and_set(cpu, cpu_initialized))
578 panic("CPU#%d already initialized!\n", cpu);
579
580 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
581
582 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
583
584 /*
585 * Initialize the per-CPU GDT with the boot GDT,
586 * and set up the GDT descriptor:
587 */
588
589 switch_to_new_gdt();
590 load_idt((const struct desc_ptr *)&idt_descr);
591
592 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
593 syscall_init();
594
595 wrmsrl(MSR_FS_BASE, 0);
596 wrmsrl(MSR_KERNEL_GS_BASE, 0);
597 barrier();
598
599 check_efer();
600
601 /*
602 * set up and load the per-CPU TSS
603 */
604 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
605 static const unsigned int order[N_EXCEPTION_STACKS] = {
606 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
607 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
608 };
609 if (cpu) {
610 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
611 if (!estacks)
612 panic("Cannot allocate exception stack %ld %d\n",
613 v, cpu);
614 }
615 estacks += PAGE_SIZE << order[v];
616 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
617 }
618
619 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
620 /*
621 * <= is required because the CPU will access up to
622 * 8 bits beyond the end of the IO permission bitmap.
623 */
624 for (i = 0; i <= IO_BITMAP_LONGS; i++)
625 t->io_bitmap[i] = ~0UL;
626
627 atomic_inc(&init_mm.mm_count);
628 me->active_mm = &init_mm;
629 if (me->mm)
630 BUG();
631 enter_lazy_tlb(&init_mm, me);
632
633 load_sp0(t, &current->thread);
634 set_tss_desc(cpu, t);
635 load_TR_desc();
636 load_LDT(&init_mm.context);
637
638#ifdef CONFIG_KGDB
639 /*
640 * If the kgdb is connected no debug regs should be altered. This
641 * is only applicable when KGDB and a KGDB I/O module are built
642 * into the kernel and you are using early debugging with
643 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
644 */
645 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
646 arch_kgdb_ops.correct_hw_break();
647 else {
648#endif
649 /*
650 * Clear all 6 debug registers:
651 */
652
653 set_debugreg(0UL, 0);
654 set_debugreg(0UL, 1);
655 set_debugreg(0UL, 2);
656 set_debugreg(0UL, 3);
657 set_debugreg(0UL, 6);
658 set_debugreg(0UL, 7);
659#ifdef CONFIG_KGDB
660 /* If the kgdb is connected no debug regs should be altered. */
661 }
662#endif
663
664 fpu_init();
665
666 raw_local_save_flags(kernel_eflags);
667
668 if (is_uv_system())
669 uv_cpu_init();
670}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a738..4d894e8565fe 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,6 @@
1#ifndef ARCH_X86_CPU_H
2
3#define ARCH_X86_CPU_H
1 4
2struct cpu_model_info { 5struct cpu_model_info {
3 int vendor; 6 int vendor;
@@ -36,3 +39,5 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
36 39
37extern int get_model_name(struct cpuinfo_x86 *c); 40extern int get_model_name(struct cpuinfo_x86 *c);
38extern void display_cacheinfo(struct cpuinfo_x86 *c); 41extern void display_cacheinfo(struct cpuinfo_x86 *c);
42
43#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..ff2fff56f0a8 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,10 +200,12 @@ static void drv_read(struct drv_cmd *cmd)
200static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
201{ 201{
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 cpumask_of_cpu_ptr_declare(cpu_mask);
203 unsigned int i; 204 unsigned int i;
204 205
205 for_each_cpu_mask(i, cmd->mask) { 206 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 207 cpumask_of_cpu_ptr_next(cpu_mask, i);
208 set_cpus_allowed_ptr(current, cpu_mask);
207 do_drv_write(cmd); 209 do_drv_write(cmd);
208 } 210 }
209 211
@@ -267,11 +269,12 @@ static unsigned int get_measured_perf(unsigned int cpu)
267 } aperf_cur, mperf_cur; 269 } aperf_cur, mperf_cur;
268 270
269 cpumask_t saved_mask; 271 cpumask_t saved_mask;
272 cpumask_of_cpu_ptr(cpu_mask, cpu);
270 unsigned int perf_percent; 273 unsigned int perf_percent;
271 unsigned int retval; 274 unsigned int retval;
272 275
273 saved_mask = current->cpus_allowed; 276 saved_mask = current->cpus_allowed;
274 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 277 set_cpus_allowed_ptr(current, cpu_mask);
275 if (get_cpu() != cpu) { 278 if (get_cpu() != cpu) {
276 /* We were not able to run on requested processor */ 279 /* We were not able to run on requested processor */
277 put_cpu(); 280 put_cpu();
@@ -337,6 +340,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
337 340
338static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 341static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
339{ 342{
343 cpumask_of_cpu_ptr(cpu_mask, cpu);
340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 344 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
341 unsigned int freq; 345 unsigned int freq;
342 unsigned int cached_freq; 346 unsigned int cached_freq;
@@ -349,7 +353,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
349 } 353 }
350 354
351 cached_freq = data->freq_table[data->acpi_data->state].frequency; 355 cached_freq = data->freq_table[data->acpi_data->state].frequency;
352 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); 356 freq = extract_freq(get_cur_val(cpu_mask), data);
353 if (freq != cached_freq) { 357 if (freq != cached_freq) {
354 /* 358 /*
355 * The dreaded BIOS frequency change behind our back. 359 * The dreaded BIOS frequency change behind our back.
@@ -451,7 +455,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 455
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 456 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 457 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 458 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 459 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 460 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 461 }
@@ -466,7 +470,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 470 }
467 } 471 }
468 472
469 for_each_cpu_mask(i, cmd.mask) { 473 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 474 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 475 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 476 }
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618e..965ea52767ac 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
26#define NFORCE2_SAFE_DISTANCE 50 26#define NFORCE2_SAFE_DISTANCE 50
27 27
28/* Delay in ms between FSB changes */ 28/* Delay in ms between FSB changes */
29//#define NFORCE2_DELAY 10 29/* #define NFORCE2_DELAY 10 */
30 30
31/* nforce2_chipset: 31/*
32 * nforce2_chipset:
32 * FSB is changed using the chipset 33 * FSB is changed using the chipset
33 */ 34 */
34static struct pci_dev *nforce2_chipset_dev; 35static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
36/* fid: 37/* fid:
37 * multiplier * 10 38 * multiplier * 10
38 */ 39 */
39static int fid = 0; 40static int fid;
40 41
41/* min_fsb, max_fsb: 42/* min_fsb, max_fsb:
42 * minimum and maximum FSB (= FSB at boot time) 43 * minimum and maximum FSB (= FSB at boot time)
43 */ 44 */
44static int min_fsb = 0; 45static int min_fsb;
45static int max_fsb = 0; 46static int max_fsb;
46 47
47MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); 48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
48MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); 49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
53 54
54MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); 55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
55MODULE_PARM_DESC(min_fsb, 56MODULE_PARM_DESC(min_fsb,
56 "Minimum FSB to use, if not defined: current FSB - 50"); 57 "Minimum FSB to use, if not defined: current FSB - 50");
57 58
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) 59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
59 60
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
139 140
140 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ 141 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
141 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 142 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
142 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL); 143 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
143 if (!nforce2_sub5) 144 if (!nforce2_sub5)
144 return 0; 145 return 0;
145 146
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
147 fsb /= 1000000; 148 fsb /= 1000000;
148 149
149 /* Check if PLL register is already set */ 150 /* Check if PLL register is already set */
150 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); 151 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
151 152
152 if(bootfsb || !temp) 153 if (bootfsb || !temp)
153 return fsb; 154 return fsb;
154 155
155 /* Use PLL register FSB value */ 156 /* Use PLL register FSB value */
156 pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp); 157 pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
157 fsb = nforce2_calc_fsb(temp); 158 fsb = nforce2_calc_fsb(temp);
158 159
159 return fsb; 160 return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
184 } 185 }
185 186
186 /* First write? Then set actual value */ 187 /* First write? Then set actual value */
187 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); 188 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
188 if (!temp) { 189 if (!temp) {
189 pll = nforce2_calc_pll(tfsb); 190 pll = nforce2_calc_pll(tfsb);
190 191
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
210 tfsb--; 211 tfsb--;
211 212
212 /* Calculate the PLL reg. value */ 213 /* Calculate the PLL reg. value */
213 if ((pll = nforce2_calc_pll(tfsb)) == -1) 214 pll = nforce2_calc_pll(tfsb);
215 if (pll == -1)
214 return -EINVAL; 216 return -EINVAL;
215 217
216 nforce2_write_pll(pll); 218 nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
249static int nforce2_target(struct cpufreq_policy *policy, 251static int nforce2_target(struct cpufreq_policy *policy,
250 unsigned int target_freq, unsigned int relation) 252 unsigned int target_freq, unsigned int relation)
251{ 253{
252// unsigned long flags; 254/* unsigned long flags; */
253 struct cpufreq_freqs freqs; 255 struct cpufreq_freqs freqs;
254 unsigned int target_fsb; 256 unsigned int target_fsb;
255 257
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
271 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 273 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
272 274
273 /* Disable IRQs */ 275 /* Disable IRQs */
274 //local_irq_save(flags); 276 /* local_irq_save(flags); */
275 277
276 if (nforce2_set_fsb(target_fsb) < 0) 278 if (nforce2_set_fsb(target_fsb) < 0)
277 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", 279 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
278 target_fsb); 280 target_fsb);
279 else 281 else
280 dprintk("Changed FSB successfully to %d\n", 282 dprintk("Changed FSB successfully to %d\n",
281 target_fsb); 283 target_fsb);
282 284
283 /* Enable IRQs */ 285 /* Enable IRQs */
284 //local_irq_restore(flags); 286 /* local_irq_restore(flags); */
285 287
286 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 288 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
287 289
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
302 policy->max = (fsb_pol_max + 1) * fid * 100; 304 policy->max = (fsb_pol_max + 1) * fid * 100;
303 305
304 cpufreq_verify_within_limits(policy, 306 cpufreq_verify_within_limits(policy,
305 policy->cpuinfo.min_freq, 307 policy->cpuinfo.min_freq,
306 policy->cpuinfo.max_freq); 308 policy->cpuinfo.max_freq);
307 return 0; 309 return 0;
308} 310}
309 311
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
347 /* Set maximum FSB to FSB at boot time */ 349 /* Set maximum FSB to FSB at boot time */
348 max_fsb = nforce2_fsb_read(1); 350 max_fsb = nforce2_fsb_read(1);
349 351
350 if(!max_fsb) 352 if (!max_fsb)
351 return -EIO; 353 return -EIO;
352 354
353 if (!min_fsb) 355 if (!min_fsb)
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..f1685fb91fbd 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..53c7b6936973 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -479,11 +479,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
479static int check_supported_cpu(unsigned int cpu) 479static int check_supported_cpu(unsigned int cpu)
480{ 480{
481 cpumask_t oldmask; 481 cpumask_t oldmask;
482 cpumask_of_cpu_ptr(cpu_mask, cpu);
482 u32 eax, ebx, ecx, edx; 483 u32 eax, ebx, ecx, edx;
483 unsigned int rc = 0; 484 unsigned int rc = 0;
484 485
485 oldmask = current->cpus_allowed; 486 oldmask = current->cpus_allowed;
486 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 487 set_cpus_allowed_ptr(current, cpu_mask);
487 488
488 if (smp_processor_id() != cpu) { 489 if (smp_processor_id() != cpu) {
489 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 490 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 967 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 968 freqs.new = find_khz_freq_from_fid(fid);
968 969
969 for_each_cpu_mask(i, *(data->available_cores)) { 970 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 971 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 972 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 973 }
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 975 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 976 freqs.new = find_khz_freq_from_fid(data->currfid);
976 977
977 for_each_cpu_mask(i, *(data->available_cores)) { 978 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 979 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 980 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 981 }
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 998 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 999 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1000
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1002 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1004 }
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1006 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1007 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1008
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1010 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1012 }
@@ -1016,6 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1016static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1017{ 1018{
1018 cpumask_t oldmask; 1019 cpumask_t oldmask;
1020 cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
1019 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1021 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1020 u32 checkfid; 1022 u32 checkfid;
1021 u32 checkvid; 1023 u32 checkvid;
@@ -1030,7 +1032,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1030 1032
1031 /* only run on specific CPU from here on */ 1033 /* only run on specific CPU from here on */
1032 oldmask = current->cpus_allowed; 1034 oldmask = current->cpus_allowed;
1033 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1035 set_cpus_allowed_ptr(current, cpu_mask);
1034 1036
1035 if (smp_processor_id() != pol->cpu) { 1037 if (smp_processor_id() != pol->cpu) {
1036 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1038 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1105,6 +1107,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1105{ 1107{
1106 struct powernow_k8_data *data; 1108 struct powernow_k8_data *data;
1107 cpumask_t oldmask; 1109 cpumask_t oldmask;
1110 cpumask_of_cpu_ptr_declare(newmask);
1108 int rc; 1111 int rc;
1109 1112
1110 if (!cpu_online(pol->cpu)) 1113 if (!cpu_online(pol->cpu))
@@ -1156,7 +1159,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1156 1159
1157 /* only run on specific CPU from here on */ 1160 /* only run on specific CPU from here on */
1158 oldmask = current->cpus_allowed; 1161 oldmask = current->cpus_allowed;
1159 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1162 cpumask_of_cpu_ptr_next(newmask, pol->cpu);
1163 set_cpus_allowed_ptr(current, newmask);
1160 1164
1161 if (smp_processor_id() != pol->cpu) { 1165 if (smp_processor_id() != pol->cpu) {
1162 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1178,7 +1182,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1178 set_cpus_allowed_ptr(current, &oldmask); 1182 set_cpus_allowed_ptr(current, &oldmask);
1179 1183
1180 if (cpu_family == CPU_HW_PSTATE) 1184 if (cpu_family == CPU_HW_PSTATE)
1181 pol->cpus = cpumask_of_cpu(pol->cpu); 1185 pol->cpus = *newmask;
1182 else 1186 else
1183 pol->cpus = per_cpu(cpu_core_map, pol->cpu); 1187 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1184 data->available_cores = &(pol->cpus); 1188 data->available_cores = &(pol->cpus);
@@ -1244,6 +1248,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1244{ 1248{
1245 struct powernow_k8_data *data; 1249 struct powernow_k8_data *data;
1246 cpumask_t oldmask = current->cpus_allowed; 1250 cpumask_t oldmask = current->cpus_allowed;
1251 cpumask_of_cpu_ptr(newmask, cpu);
1247 unsigned int khz = 0; 1252 unsigned int khz = 0;
1248 unsigned int first; 1253 unsigned int first;
1249 1254
@@ -1253,7 +1258,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1253 if (!data) 1258 if (!data)
1254 return -EINVAL; 1259 return -EINVAL;
1255 1260
1256 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1261 set_cpus_allowed_ptr(current, newmask);
1257 if (smp_processor_id() != cpu) { 1262 if (smp_processor_id() != cpu) {
1258 printk(KERN_ERR PFX 1263 printk(KERN_ERR PFX
1259 "limiting to CPU %d failed in powernowk8_get\n", cpu); 1264 "limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..ca2ac13b7af2 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -28,7 +28,8 @@
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -313,9 +324,10 @@ static unsigned int get_cur_freq(unsigned int cpu)
313 unsigned l, h; 324 unsigned l, h;
314 unsigned clock_freq; 325 unsigned clock_freq;
315 cpumask_t saved_mask; 326 cpumask_t saved_mask;
327 cpumask_of_cpu_ptr(new_mask, cpu);
316 328
317 saved_mask = current->cpus_allowed; 329 saved_mask = current->cpus_allowed;
318 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 330 set_cpus_allowed_ptr(current, new_mask);
319 if (smp_processor_id() != cpu) 331 if (smp_processor_id() != cpu)
320 return 0; 332 return 0;
321 333
@@ -347,7 +359,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 359 int i;
348 360
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 361 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 362 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
363 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 364 return -ENODEV;
352 365
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 366 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +374,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 374 break;
362 375
363 if (i != N_IDS) 376 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 377 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 378
366 if (!centrino_cpu[policy->cpu]) { 379 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 380 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 381 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 382 MAINTAINER "\n");
@@ -386,23 +399,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 399 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 400 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 401 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 402 printk(KERN_INFO PFX
403 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 404 return -ENODEV;
391 } 405 }
392 } 406 }
393 407
394 freq = get_cur_freq(policy->cpu); 408 freq = get_cur_freq(policy->cpu);
395 409 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 410 /* 10uS transition latency */
397 policy->cur = freq; 411 policy->cur = freq;
398 412
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 413 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 414
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 415 ret = cpufreq_frequency_table_cpuinfo(policy,
416 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 417 if (ret)
403 return (ret); 418 return (ret);
404 419
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 420 cpufreq_frequency_table_get_attr(
421 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 422
407 return 0; 423 return 0;
408} 424}
@@ -411,12 +427,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 427{
412 unsigned int cpu = policy->cpu; 428 unsigned int cpu = policy->cpu;
413 429
414 if (!centrino_model[cpu]) 430 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 431 return -ENODEV;
416 432
417 cpufreq_frequency_table_put_attr(cpu); 433 cpufreq_frequency_table_put_attr(cpu);
418 434
419 centrino_model[cpu] = NULL; 435 per_cpu(centrino_model, cpu) = NULL;
420 436
421 return 0; 437 return 0;
422} 438}
@@ -430,17 +446,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 446 */
431static int centrino_verify (struct cpufreq_policy *policy) 447static int centrino_verify (struct cpufreq_policy *policy)
432{ 448{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 449 return cpufreq_frequency_table_verify(policy,
450 per_cpu(centrino_model, policy->cpu)->op_points);
434} 451}
435 452
436/** 453/**
437 * centrino_setpolicy - set a new CPUFreq policy 454 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 455 * @policy: new policy
439 * @target_freq: the target frequency 456 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 457 * @relation: how that frequency relates to achieved frequency
458 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 459 *
442 * Sets a new CPUFreq policy. 460 * Sets a new CPUFreq policy.
443 */ 461 */
462struct allmasks {
463 cpumask_t online_policy_cpus;
464 cpumask_t saved_mask;
465 cpumask_t set_mask;
466 cpumask_t covered_cpus;
467};
468
444static int centrino_target (struct cpufreq_policy *policy, 469static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 470 unsigned int target_freq,
446 unsigned int relation) 471 unsigned int relation)
@@ -448,48 +473,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 473 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 474 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 475 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 476 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 477 unsigned int j, k, first_cpu, tmp;
457 478 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 479 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 480 CPUMASK_PTR(saved_mask, allmasks);
481 CPUMASK_PTR(set_mask, allmasks);
482 CPUMASK_PTR(covered_cpus, allmasks);
483
484 if (unlikely(allmasks == NULL))
485 return -ENOMEM;
486
487 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
488 retval = -ENODEV;
489 goto out;
490 }
460 491
461 if (unlikely(cpufreq_frequency_table_target(policy, 492 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 493 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 494 target_freq,
464 relation, 495 relation,
465 &newstate))) { 496 &newstate))) {
466 return -EINVAL; 497 retval = -EINVAL;
498 goto out;
467 } 499 }
468 500
469#ifdef CONFIG_HOTPLUG_CPU 501#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 502 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 503 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 504#else
473 online_policy_cpus = policy->cpus; 505 *online_policy_cpus = policy->cpus;
474#endif 506#endif
475 507
476 saved_mask = current->cpus_allowed; 508 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 509 first_cpu = 1;
478 cpus_clear(covered_cpus); 510 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 511 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 512 /*
481 * Support for SMP systems. 513 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 514 * Make sure we are running on CPU that wants to change freq
483 */ 515 */
484 cpus_clear(set_mask); 516 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 517 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 518 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 519 else
488 cpu_set(j, set_mask); 520 cpu_set(j, *set_mask);
489 521
490 set_cpus_allowed_ptr(current, &set_mask); 522 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 523 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 524 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 525 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 526 retval = -EAGAIN;
495 if (first_cpu) { 527 if (first_cpu) {
@@ -500,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 532 break;
501 } 533 }
502 534
503 msr = centrino_model[cpu]->op_points[newstate].index; 535 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 536
505 if (first_cpu) { 537 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 538 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +549,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 549 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 550 target_freq, freqs.old, freqs.new, msr);
519 551
520 for_each_cpu_mask(k, online_policy_cpus) { 552 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 553 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 554 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 555 CPUFREQ_PRECHANGE);
@@ -536,11 +568,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 568 break;
537 } 569 }
538 570
539 cpu_set(j, covered_cpus); 571 cpu_set(j, *covered_cpus);
540 preempt_enable(); 572 preempt_enable();
541 } 573 }
542 574
543 for_each_cpu_mask(k, online_policy_cpus) { 575 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 576 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 577 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 578 }
@@ -553,10 +585,12 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 585 * Best effort undo..
554 */ 586 */
555 587
556 if (!cpus_empty(covered_cpus)) { 588 if (!cpus_empty(*covered_cpus)) {
557 for_each_cpu_mask(j, covered_cpus) { 589 cpumask_of_cpu_ptr_declare(new_mask);
558 set_cpus_allowed_ptr(current, 590
559 &cpumask_of_cpu(j)); 591 for_each_cpu_mask_nr(j, *covered_cpus) {
592 cpumask_of_cpu_ptr_next(new_mask, j);
593 set_cpus_allowed_ptr(current, new_mask);
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 594 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 595 }
562 } 596 }
@@ -564,19 +598,22 @@ static int centrino_target (struct cpufreq_policy *policy,
564 tmp = freqs.new; 598 tmp = freqs.new;
565 freqs.new = freqs.old; 599 freqs.new = freqs.old;
566 freqs.old = tmp; 600 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 601 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 602 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 603 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 604 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 605 }
572 } 606 }
573 set_cpus_allowed_ptr(current, &saved_mask); 607 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 608 retval = 0;
609 goto out;
575 610
576migrate_end: 611migrate_end:
577 preempt_enable(); 612 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 613 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 614out:
615 CPUMASK_FREE(allmasks);
616 return retval;
580} 617}
581 618
582static struct freq_attr* centrino_attr[] = { 619static struct freq_attr* centrino_attr[] = {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..2f3728dc24f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,7 +244,8 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 return _speedstep_get(&cpumask_of_cpu(cpu)); 247 cpumask_of_cpu_ptr(newmask, cpu);
248 return _speedstep_get(newmask);
248} 249}
249 250
250/** 251/**
@@ -279,7 +280,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 280
280 cpus_allowed = current->cpus_allowed; 281 cpus_allowed = current->cpus_allowed;
281 282
282 for_each_cpu_mask(i, policy->cpus) { 283 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 284 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 285 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 286 }
@@ -292,7 +293,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 293 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 294 set_cpus_allowed_ptr(current, &cpus_allowed);
294 295
295 for_each_cpu_mask(i, policy->cpus) { 296 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 297 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 298 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 299 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fe9224c51d37..b75f2569b8f8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -226,6 +226,20 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
226 226
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
240#ifdef CONFIG_X86_NUMAQ
241 numaq_tsc_disable();
242#endif
229} 243}
230 244
231static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 245static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
new file mode 100644
index 000000000000..1019c58d39f0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -0,0 +1,95 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb149..650d40f7912b 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
62 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 62 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
63 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 63 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
64 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 64 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
65 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
65 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 66 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
66 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 67 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
67 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 68 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -488,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
488 int sibling; 489 int sibling;
489 490
490 this_leaf = CPUID4_INFO_IDX(cpu, index); 491 this_leaf = CPUID4_INFO_IDX(cpu, index);
491 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
492 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 493 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
493 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 494 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
494 } 495 }
@@ -515,6 +516,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
515 unsigned long j; 516 unsigned long j;
516 int retval; 517 int retval;
517 cpumask_t oldmask; 518 cpumask_t oldmask;
519 cpumask_of_cpu_ptr(newmask, cpu);
518 520
519 if (num_cache_leaves == 0) 521 if (num_cache_leaves == 0)
520 return -ENOENT; 522 return -ENOENT;
@@ -525,7 +527,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
525 return -ENOMEM; 527 return -ENOMEM;
526 528
527 oldmask = current->cpus_allowed; 529 oldmask = current->cpus_allowed;
528 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 530 retval = set_cpus_allowed_ptr(current, newmask);
529 if (retval) 531 if (retval)
530 goto out; 532 goto out;
531 533
@@ -779,15 +781,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
779 } 781 }
780 kobject_put(per_cpu(cache_kobject, cpu)); 782 kobject_put(per_cpu(cache_kobject, cpu));
781 cpuid4_cache_sysfs_exit(cpu); 783 cpuid4_cache_sysfs_exit(cpu);
782 break; 784 return retval;
783 } 785 }
784 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 786 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
785 } 787 }
786 if (!retval) 788 cpu_set(cpu, cache_dev_map);
787 cpu_set(cpu, cache_dev_map);
788 789
789 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
790 return retval; 791 return 0;
791} 792}
792 793
793static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b764..f390c9f66351 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/smp.h> 10#include <linux/smp.h>
11 11
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For AMD Athlon/Duron */ 18/* Machine Check Handler For AMD Athlon/Duron */
19static void k7_machine_check(struct pt_regs * regs, long error_code) 19static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover = 1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 23 u32 mcgstl, mcgsth;
24 int i; 24 int i;
25 25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover = 0;
29 29
30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
60 } 60 }
61 61
62 if (recover&2) 62 if (recover&2)
63 panic ("CPU context corrupt"); 63 panic("CPU context corrupt");
64 if (recover&1) 64 if (recover&1)
65 panic ("Unable to continue"); 65 panic("Unable to continue");
66 printk (KERN_EMERG "Attempting to continue.\n"); 66 printk(KERN_EMERG "Attempting to continue.\n");
67 mcgstl &= ~(1<<2); 67 mcgstl &= ~(1<<2);
68 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); 68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 69}
70 70
71 71
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
81 machine_check_vector = k7_machine_check; 81 machine_check_vector = k7_machine_check;
82 wmb(); 82 wmb();
83 83
84 printk (KERN_INFO "Intel machine check architecture supported.\n"); 84 printk(KERN_INFO "Intel machine check architecture supported.\n");
85 rdmsr (MSR_IA32_MCG_CAP, l, h); 85 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 86 if (l & (1<<8)) /* Control register present ? */
87 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 88 nr_mce_banks = l & 0xff;
89 89
90 /* Clear status for MC index 0 separately, we don't touch CTL, 90 /* Clear status for MC index 0 separately, we don't touch CTL,
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 91 * as some K7 Athlons cause spurious MCEs when its enabled. */
92 if (boot_cpu_data.x86 == 6) { 92 if (boot_cpu_data.x86 == 6) {
93 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); 93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 94 i = 1;
95 } else 95 } else
96 i = 0; 96 i = 0;
97 for (; i<nr_mce_banks; i++) { 97 for (; i < nr_mce_banks; i++) {
98 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
100 } 100 }
101 101
102 set_in_cr4 (X86_CR4_MCE); 102 set_in_cr4(X86_CR4_MCE);
103 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", 103 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
104 smp_processor_id()); 104 smp_processor_id());
105} 105}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..65a339678ece 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -9,6 +9,7 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
14#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
@@ -31,7 +32,7 @@
31#include <asm/idle.h> 32#include <asm/idle.h>
32 33
33#define MISC_MCELOG_MINOR 227 34#define MISC_MCELOG_MINOR 227
34#define NR_BANKS 6 35#define NR_SYSFS_BANKS 6
35 36
36atomic_t mce_entry; 37atomic_t mce_entry;
37 38
@@ -46,7 +47,7 @@ static int mce_dont_init;
46 */ 47 */
47static int tolerant = 1; 48static int tolerant = 1;
48static int banks; 49static int banks;
49static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; 50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
50static unsigned long notify_user; 51static unsigned long notify_user;
51static int rip_msr; 52static int rip_msr;
52static int mce_bootlog = -1; 53static int mce_bootlog = -1;
@@ -209,7 +210,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
209 barrier(); 210 barrier();
210 211
211 for (i = 0; i < banks; i++) { 212 for (i = 0; i < banks; i++) {
212 if (!bank[i]) 213 if (i < NR_SYSFS_BANKS && !bank[i])
213 continue; 214 continue;
214 215
215 m.misc = 0; 216 m.misc = 0;
@@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info)
363 364
364static void mcheck_timer(struct work_struct *work) 365static void mcheck_timer(struct work_struct *work)
365{ 366{
366 on_each_cpu(mcheck_check_cpu, NULL, 1, 1); 367 on_each_cpu(mcheck_check_cpu, NULL, 1);
367 368
368 /* 369 /*
369 * Alert userspace if needed. If we logged an MCE, reduce the 370 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -444,9 +445,10 @@ static void mce_init(void *dummy)
444 445
445 rdmsrl(MSR_IA32_MCG_CAP, cap); 446 rdmsrl(MSR_IA32_MCG_CAP, cap);
446 banks = cap & 0xff; 447 banks = cap & 0xff;
447 if (banks > NR_BANKS) { 448 if (banks > MCE_EXTENDED_BANK) {
448 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); 449 banks = MCE_EXTENDED_BANK;
449 banks = NR_BANKS; 450 printk(KERN_INFO "MCE: warning: using only %d banks\n",
451 MCE_EXTENDED_BANK);
450 } 452 }
451 /* Use accurate RIP reporting if available. */ 453 /* Use accurate RIP reporting if available. */
452 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +464,11 @@ static void mce_init(void *dummy)
462 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
463 465
464 for (i = 0; i < banks; i++) { 466 for (i = 0; i < banks; i++) {
465 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 467 if (i < NR_SYSFS_BANKS)
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
466 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
467 } 473 }
468} 474}
@@ -527,10 +533,12 @@ static int open_exclu; /* already open exclusive? */
527 533
528static int mce_open(struct inode *inode, struct file *file) 534static int mce_open(struct inode *inode, struct file *file)
529{ 535{
536 lock_kernel();
530 spin_lock(&mce_state_lock); 537 spin_lock(&mce_state_lock);
531 538
532 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 539 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
533 spin_unlock(&mce_state_lock); 540 spin_unlock(&mce_state_lock);
541 unlock_kernel();
534 return -EBUSY; 542 return -EBUSY;
535 } 543 }
536 544
@@ -539,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file)
539 open_count++; 547 open_count++;
540 548
541 spin_unlock(&mce_state_lock); 549 spin_unlock(&mce_state_lock);
550 unlock_kernel();
542 551
543 return nonseekable_open(inode, file); 552 return nonseekable_open(inode, file);
544} 553}
@@ -571,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
571 char __user *buf = ubuf; 580 char __user *buf = ubuf;
572 int i, err; 581 int i, err;
573 582
574 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
575 if (!cpu_tsc) 584 if (!cpu_tsc)
576 return -ENOMEM; 585 return -ENOMEM;
577 586
@@ -612,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
612 * Collect entries that were still getting written before the 621 * Collect entries that were still getting written before the
613 * synchronize. 622 * synchronize.
614 */ 623 */
615 on_each_cpu(collect_tscs, cpu_tsc, 1, 1); 624 on_each_cpu(collect_tscs, cpu_tsc, 1);
616 for (i = next; i < MCE_LOG_LEN; i++) { 625 for (i = next; i < MCE_LOG_LEN; i++) {
617 if (mcelog.entry[i].finished && 626 if (mcelog.entry[i].finished &&
618 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 627 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
@@ -737,7 +746,7 @@ static void mce_restart(void)
737 if (next_interval) 746 if (next_interval)
738 cancel_delayed_work(&mcheck_work); 747 cancel_delayed_work(&mcheck_work);
739 /* Timer race is harmless here */ 748 /* Timer race is harmless here */
740 on_each_cpu(mce_init, NULL, 1, 1); 749 on_each_cpu(mce_init, NULL, 1);
741 next_interval = check_interval * HZ; 750 next_interval = check_interval * HZ;
742 if (next_interval) 751 if (next_interval)
743 schedule_delayed_work(&mcheck_work, 752 schedule_delayed_work(&mcheck_work,
@@ -753,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
753 762
754/* Why are there no generic functions for this? */ 763/* Why are there no generic functions for this? */
755#define ACCESSOR(name, var, start) \ 764#define ACCESSOR(name, var, start) \
756 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 765 static ssize_t show_ ## name(struct sys_device *s, \
766 struct sysdev_attribute *attr, \
767 char *buf) { \
757 return sprintf(buf, "%lx\n", (unsigned long)var); \ 768 return sprintf(buf, "%lx\n", (unsigned long)var); \
758 } \ 769 } \
759 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 770 static ssize_t set_ ## name(struct sys_device *s, \
771 struct sysdev_attribute *attr, \
772 const char *buf, size_t siz) { \
760 char *end; \ 773 char *end; \
761 unsigned long new = simple_strtoul(buf, &end, 0); \ 774 unsigned long new = simple_strtoul(buf, &end, 0); \
762 if (end == buf) return -EINVAL; \ 775 if (end == buf) return -EINVAL; \
@@ -766,7 +779,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
766 } \ 779 } \
767 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 780 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
768 781
769/* TBD should generate these dynamically based on number of available banks */ 782/*
783 * TBD should generate these dynamically based on number of available banks.
784 * Have only 6 contol banks in /sysfs until then.
785 */
770ACCESSOR(bank0ctl,bank[0],mce_restart()) 786ACCESSOR(bank0ctl,bank[0],mce_restart())
771ACCESSOR(bank1ctl,bank[1],mce_restart()) 787ACCESSOR(bank1ctl,bank[1],mce_restart())
772ACCESSOR(bank2ctl,bank[2],mce_restart()) 788ACCESSOR(bank2ctl,bank[2],mce_restart())
@@ -774,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
774ACCESSOR(bank4ctl,bank[4],mce_restart()) 790ACCESSOR(bank4ctl,bank[4],mce_restart())
775ACCESSOR(bank5ctl,bank[5],mce_restart()) 791ACCESSOR(bank5ctl,bank[5],mce_restart())
776 792
777static ssize_t show_trigger(struct sys_device *s, char *buf) 793static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
794 char *buf)
778{ 795{
779 strcpy(buf, trigger); 796 strcpy(buf, trigger);
780 strcat(buf, "\n"); 797 strcat(buf, "\n");
781 return strlen(trigger) + 1; 798 return strlen(trigger) + 1;
782} 799}
783 800
784static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 801static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
802 const char *buf,size_t siz)
785{ 803{
786 char *p; 804 char *p;
787 int len; 805 int len;
@@ -794,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
794} 812}
795 813
796static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 814static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
797ACCESSOR(tolerant,tolerant,) 815static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
798ACCESSOR(check_interval,check_interval,mce_restart()) 816ACCESSOR(check_interval,check_interval,mce_restart())
799static struct sysdev_attribute *mce_attributes[] = { 817static struct sysdev_attribute *mce_attributes[] = {
800 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 818 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
801 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 819 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
802 &attr_tolerant, &attr_check_interval, &attr_trigger, 820 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
803 NULL 821 NULL
804}; 822};
805 823
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..88736cadbaa6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 00ccb6c14ec2..cc1fccdd31e0 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
59 59
60static void mce_work_fn(struct work_struct *work) 60static void mce_work_fn(struct work_struct *work)
61{ 61{
62 on_each_cpu(mce_checkregs, NULL, 1, 1); 62 on_each_cpu(mce_checkregs, NULL, 1);
63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 63 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
64} 64}
65 65
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a5..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
32 /* u32 *reserved[]; */ 32 /* u32 *reserved[]; */
33}; 33};
34 34
35static int mce_num_extended_msrs = 0; 35static int mce_num_extended_msrs;
36 36
37 37
38#ifdef CONFIG_X86_MCE_P4THERMAL 38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id()); 42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
83 * be some SMM goo which handles it, so we can't even put a handler 83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem. 84 * since it might be delivered via SMI already -zwanem.
85 */ 85 */
86 rdmsr (MSR_IA32_MISC_ENABLE, l, h); 86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR); 87 h = apic_read(APIC_LVTTHMR);
88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) { 88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", 89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
91 return; /* -EBUSY */ 91 return; /* -EBUSY */
92 } 92 }
93 93
94 /* check whether a vector already exists, temporarily masked? */ 94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) { 95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " 96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n", 97 "installed\n",
@@ -102,20 +102,20 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109 109
110 /* ok we're good to go... */ 110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 111 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 112
116 l = apic_read (APIC_LVTTHMR); 113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
117 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
118 printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1); 121 atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 129{
130 u32 h; 130 u32 h;
131 131
132 rdmsr (MSR_IA32_MCG_EAX, r->eax, h); 132 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
133 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); 133 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
134 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); 134 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
135 rdmsr (MSR_IA32_MCG_EDX, r->edx, h); 135 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
136 rdmsr (MSR_IA32_MCG_ESI, r->esi, h); 136 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
137 rdmsr (MSR_IA32_MCG_EDI, r->edi, h); 137 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
138 rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); 138 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
139 rdmsr (MSR_IA32_MCG_ESP, r->esp, h); 139 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
140 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); 140 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 141 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
142} 142}
143 143
144static void intel_machine_check(struct pt_regs * regs, long error_code) 144static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 145{
146 int recover=1; 146 int recover = 1;
147 u32 alow, ahigh, high, low; 147 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 148 u32 mcgstl, mcgsth;
149 int i; 149 int i;
150 150
151 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
152 if (mcgstl & (1<<0)) /* Recoverable ? */ 152 if (mcgstl & (1<<0)) /* Recoverable ? */
153 recover=0; 153 recover = 0;
154 154
155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
156 smp_processor_id(), mcgsth, mcgstl); 156 smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
191 } 191 }
192 192
193 if (recover & 2) 193 if (recover & 2)
194 panic ("CPU context corrupt"); 194 panic("CPU context corrupt");
195 if (recover & 1) 195 if (recover & 1)
196 panic ("Unable to continue"); 196 panic("Unable to continue");
197 197
198 printk(KERN_EMERG "Attempting to continue.\n"); 198 printk(KERN_EMERG "Attempting to continue.\n");
199 /* 199 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 201 * recoverable/continuable.This will allow BIOS to look at the MSRs
202 * for errors if the OS could not log the error. 202 * for errors if the OS could not log the error.
203 */ 203 */
204 for (i=0; i<nr_mce_banks; i++) { 204 for (i = 0; i < nr_mce_banks; i++) {
205 u32 msr; 205 u32 msr;
206 msr = MSR_IA32_MC0_STATUS+i*4; 206 msr = MSR_IA32_MC0_STATUS+i*4;
207 rdmsr (msr, low, high); 207 rdmsr(msr, low, high);
208 if (high&(1<<31)) { 208 if (high&(1<<31)) {
209 /* Clear it */ 209 /* Clear it */
210 wrmsr(msr, 0UL, 0UL); 210 wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
214 } 214 }
215 } 215 }
216 mcgstl &= ~(1<<2); 216 mcgstl &= ~(1<<2);
217 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); 217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 218}
219 219
220 220
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 222{
223 u32 l, h; 223 u32 l, h;
224 int i; 224 int i;
225 225
226 machine_check_vector = intel_machine_check; 226 machine_check_vector = intel_machine_check;
227 wmb(); 227 wmb();
228 228
229 printk (KERN_INFO "Intel machine check architecture supported.\n"); 229 printk(KERN_INFO "Intel machine check architecture supported.\n");
230 rdmsr (MSR_IA32_MCG_CAP, l, h); 230 rdmsr(MSR_IA32_MCG_CAP, l, h);
231 if (l & (1<<8)) /* Control register present ? */ 231 if (l & (1<<8)) /* Control register present ? */
232 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 232 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
233 nr_mce_banks = l & 0xff; 233 nr_mce_banks = l & 0xff;
234 234
235 for (i=0; i<nr_mce_banks; i++) { 235 for (i = 0; i < nr_mce_banks; i++) {
236 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 236 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
237 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 237 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
238 } 238 }
239 239
240 set_in_cr4 (X86_CR4_MCE); 240 set_in_cr4(X86_CR4_MCE);
241 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", 241 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
242 smp_processor_id()); 242 smp_processor_id());
243 243
244 /* Check for P4/Xeon extended MCE MSRs */ 244 /* Check for P4/Xeon extended MCE MSRs */
245 rdmsr (MSR_IA32_MCG_CAP, l, h); 245 rdmsr(MSR_IA32_MCG_CAP, l, h);
246 if (l & (1<<9)) {/* MCG_EXT_P */ 246 if (l & (1<<9)) {/* MCG_EXT_P */
247 mce_num_extended_msrs = (l >> 16) & 0xff; 247 mce_num_extended_msrs = (l >> 16) & 0xff;
248 printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" 248 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
249 " available\n", 249 " available\n",
250 smp_processor_id(), mce_num_extended_msrs); 250 smp_processor_id(), mce_num_extended_msrs);
251 251
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..509bd3d9eacd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
37static unsigned long smp_changes_mask; 37static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {}; 38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 39static int mtrr_state_set;
40static u64 tom2; 40u64 mtrr_tom2;
41 41
42#undef MODULE_PARAM_PREFIX 42#undef MODULE_PARAM_PREFIX
43#define MODULE_PARAM_PREFIX "mtrr." 43#define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
139 } 139 }
140 } 140 }
141 141
142 if (tom2) { 142 if (mtrr_tom2) {
143 if (start >= (1ULL<<32) && (end < tom2)) 143 if (start >= (1ULL<<32) && (end < mtrr_tom2))
144 return MTRR_TYPE_WRBACK; 144 return MTRR_TYPE_WRBACK;
145 } 145 }
146 146
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
158 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 158 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
159} 159}
160 160
161/* fill the MSR pair relating to a var range */
162void fill_mtrr_var_range(unsigned int index,
163 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
164{
165 struct mtrr_var_range *vr;
166
167 vr = mtrr_state.var_ranges;
168
169 vr[index].base_lo = base_lo;
170 vr[index].base_hi = base_hi;
171 vr[index].mask_lo = mask_lo;
172 vr[index].mask_hi = mask_hi;
173}
174
161static void 175static void
162get_fixed_ranges(mtrr_type * frs) 176get_fixed_ranges(mtrr_type * frs)
163{ 177{
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
213 mtrr_state.enabled = (lo & 0xc00) >> 10; 227 mtrr_state.enabled = (lo & 0xc00) >> 10;
214 228
215 if (amd_special_default_mtrr()) { 229 if (amd_special_default_mtrr()) {
216 unsigned lo, hi; 230 unsigned low, high;
217 /* TOP_MEM2 */ 231 /* TOP_MEM2 */
218 rdmsr(MSR_K8_TOP_MEM2, lo, hi); 232 rdmsr(MSR_K8_TOP_MEM2, low, high);
219 tom2 = hi; 233 mtrr_tom2 = high;
220 tom2 <<= 32; 234 mtrr_tom2 <<= 32;
221 tom2 |= lo; 235 mtrr_tom2 |= low;
222 tom2 &= 0xffffff8000000ULL; 236 mtrr_tom2 &= 0xffffff800000ULL;
223 } 237 }
224 if (mtrr_show) { 238 if (mtrr_show) {
225 int high_width; 239 int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
251 else 265 else
252 printk(KERN_INFO "MTRR %u disabled\n", i); 266 printk(KERN_INFO "MTRR %u disabled\n", i);
253 } 267 }
254 if (tom2) { 268 if (mtrr_tom2) {
255 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", 269 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
256 tom2, tom2>>20); 270 mtrr_tom2, mtrr_tom2>>20);
257 } 271 }
258 } 272 }
259 mtrr_state_set = 1; 273 mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
328 342
329 if (lo != msrwords[0] || hi != msrwords[1]) { 343 if (lo != msrwords[0] || hi != msrwords[1]) {
330 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 344 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
331 boot_cpu_data.x86 == 15 && 345 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
332 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) 346 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
333 k8_enable_fixed_iorrs(); 347 k8_enable_fixed_iorrs();
334 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 348 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..6f23969c8faf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/cpu.h> 38#include <linux/cpu.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/sort.h>
40 41
41#include <asm/e820.h> 42#include <asm/e820.h>
42#include <asm/mtrr.h> 43#include <asm/mtrr.h>
@@ -222,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
222 atomic_set(&data.gate,0); 223 atomic_set(&data.gate,0);
223 224
224 /* Start the ball rolling on other CPUs */ 225 /* Start the ball rolling on other CPUs */
225 if (smp_call_function(ipi_handler, &data, 1, 0) != 0) 226 if (smp_call_function(ipi_handler, &data, 0) != 0)
226 panic("mtrr: timed out waiting for other CPUs\n"); 227 panic("mtrr: timed out waiting for other CPUs\n");
227 228
228 local_irq_save(flags); 229 local_irq_save(flags);
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
609 .resume = mtrr_restore, 610 .resume = mtrr_restore,
610}; 611};
611 612
613/* should be related to MTRR_VAR_RANGES nums */
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "run of slot in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE)
763 continue;
764 size = range_state[i].size_pfn;
765 if (!size)
766 continue;
767 base = range_state[i].base_pfn;
768 subtract_range(range, base, base + size - 1);
769 }
770 if (extra_remove_size)
771 subtract_range(range, extra_remove_base,
772 extra_remove_base + extra_remove_size - 1);
773
774 /* get new range num */
775 nr_range = 0;
776 for (i = 0; i < RANGE_NUM; i++) {
777 if (!range[i].end)
778 continue;
779 nr_range++;
780 }
781 if (debug_print) {
782 printk(KERN_DEBUG "After UC checking\n");
783 for (i = 0; i < nr_range; i++)
784 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
785 range[i].start, range[i].end + 1);
786 }
787
788 /* sort the ranges */
789 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
790 if (debug_print) {
791 printk(KERN_DEBUG "After sorting\n");
792 for (i = 0; i < nr_range; i++)
793 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
794 range[i].start, range[i].end + 1);
795 }
796
797 /* clear those is not used */
798 for (i = nr_range; i < RANGE_NUM; i++)
799 memset(&range[i], 0, sizeof(range[i]));
800
801 return nr_range;
802}
803
804static struct res_range __initdata range[RANGE_NUM];
805
806#ifdef CONFIG_MTRR_SANITIZER
807
808static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
809{
810 unsigned long sum;
811 int i;
812
813 sum = 0;
814 for (i = 0; i < nr_range; i++)
815 sum += range[i].end + 1 - range[i].start;
816
817 return sum;
818}
819
820static int enable_mtrr_cleanup __initdata =
821 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
822
823static int __init disable_mtrr_cleanup_setup(char *str)
824{
825 if (enable_mtrr_cleanup != -1)
826 enable_mtrr_cleanup = 0;
827 return 0;
828}
829early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
830
831static int __init enable_mtrr_cleanup_setup(char *str)
832{
833 if (enable_mtrr_cleanup != -1)
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839struct var_mtrr_state {
840 unsigned long range_startk;
841 unsigned long range_sizek;
842 unsigned long chunk_sizek;
843 unsigned long gran_sizek;
844 unsigned int reg;
845};
846
847static void __init
848set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
849 unsigned char type, unsigned int address_bits)
850{
851 u32 base_lo, base_hi, mask_lo, mask_hi;
852 u64 base, mask;
853
854 if (!sizek) {
855 fill_mtrr_var_range(reg, 0, 0, 0, 0);
856 return;
857 }
858
859 mask = (1ULL << address_bits) - 1;
860 mask &= ~((((u64)sizek) << 10) - 1);
861
862 base = ((u64)basek) << 10;
863
864 base |= type;
865 mask |= 0x800;
866
867 base_lo = base & ((1ULL<<32) - 1);
868 base_hi = base >> 32;
869
870 mask_lo = mask & ((1ULL<<32) - 1);
871 mask_hi = mask >> 32;
872
873 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
874}
875
876static void __init
877save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
878 unsigned char type)
879{
880 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
881 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
882 range_state[reg].type = type;
883}
884
885static void __init
886set_var_mtrr_all(unsigned int address_bits)
887{
888 unsigned long basek, sizek;
889 unsigned char type;
890 unsigned int reg;
891
892 for (reg = 0; reg < num_var_ranges; reg++) {
893 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
894 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
895 type = range_state[reg].type;
896
897 set_var_mtrr(reg, basek, sizek, type, address_bits);
898 }
899}
900
901static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type)
904{
905 if (!range_sizek || (reg >= num_var_ranges))
906 return reg;
907
908 while (range_sizek) {
909 unsigned long max_align, align;
910 unsigned long sizek;
911
912 /* Compute the maximum size I can make a range */
913 if (range_startk)
914 max_align = ffs(range_startk) - 1;
915 else
916 max_align = 32;
917 align = fls(range_sizek) - 1;
918 if (align > max_align)
919 align = max_align;
920
921 sizek = 1 << align;
922 if (debug_print)
923 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 );
929 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek;
931 range_sizek -= sizek;
932 if (reg >= num_var_ranges)
933 break;
934 }
935 return reg;
936}
937
938static unsigned __init
939range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
940 unsigned long sizek)
941{
942 unsigned long hole_basek, hole_sizek;
943 unsigned long second_basek, second_sizek;
944 unsigned long range0_basek, range0_sizek;
945 unsigned long range_basek, range_sizek;
946 unsigned long chunk_sizek;
947 unsigned long gran_sizek;
948
949 hole_basek = 0;
950 hole_sizek = 0;
951 second_basek = 0;
952 second_sizek = 0;
953 chunk_sizek = state->chunk_sizek;
954 gran_sizek = state->gran_sizek;
955
956 /* align with gran size, prevent small block used up MTRRs */
957 range_basek = ALIGN(state->range_startk, gran_sizek);
958 if ((range_basek > basek) && basek)
959 return second_sizek;
960 state->range_sizek -= (range_basek - state->range_startk);
961 range_sizek = ALIGN(state->range_sizek, gran_sizek);
962
963 while (range_sizek > state->range_sizek) {
964 range_sizek -= gran_sizek;
965 if (!range_sizek)
966 return 0;
967 }
968 state->range_sizek = range_sizek;
969
970 /* try to append some small hole */
971 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
973 if (range0_sizek == state->range_sizek) {
974 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
976 range0_basek<<10,
977 (range0_basek + state->range_sizek)<<10);
978 state->reg = range_to_mtrr(state->reg, range0_basek,
979 state->range_sizek, MTRR_TYPE_WRBACK);
980 return 0;
981 }
982
983 range0_sizek -= chunk_sizek;
984 if (range0_sizek && sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek;
987 if (!range0_sizek)
988 break;
989 }
990 }
991
992 if (range0_sizek) {
993 if (debug_print)
994 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
995 range0_basek<<10,
996 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 }
1011
1012 /* if last piece, only could one hole near end */
1013 if ((second_basek || !basek) &&
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0;
1028 second_sizek = 0;
1029 }
1030
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) {
1037 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10);
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
1041 MTRR_TYPE_UNCACHABLE);
1042
1043 }
1044
1045 return second_sizek;
1046}
1047
1048static void __init
1049set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1050 unsigned long size_pfn)
1051{
1052 unsigned long basek, sizek;
1053 unsigned long second_sizek = 0;
1054
1055 if (state->reg >= num_var_ranges)
1056 return;
1057
1058 basek = base_pfn << (PAGE_SHIFT - 10);
1059 sizek = size_pfn << (PAGE_SHIFT - 10);
1060
1061 /* See if I can merge with the last range */
1062 if ((basek <= 1024) ||
1063 (state->range_startk + state->range_sizek == basek)) {
1064 unsigned long endk = basek + sizek;
1065 state->range_sizek = endk - state->range_startk;
1066 return;
1067 }
1068 /* Write the range mtrrs */
1069 if (state->range_sizek != 0)
1070 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1071
1072 /* Allocate an msr */
1073 state->range_startk = basek + second_sizek;
1074 state->range_sizek = sizek - second_sizek;
1075}
1076
1077/* mininum size of mtrr block that can take hole */
1078static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1079
1080static int __init parse_mtrr_chunk_size_opt(char *p)
1081{
1082 if (!p)
1083 return -EINVAL;
1084 mtrr_chunk_size = memparse(p, &p);
1085 return 0;
1086}
1087early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1088
1089/* granity of mtrr of block */
1090static u64 mtrr_gran_size __initdata;
1091
1092static int __init parse_mtrr_gran_size_opt(char *p)
1093{
1094 if (!p)
1095 return -EINVAL;
1096 mtrr_gran_size = memparse(p, &p);
1097 return 0;
1098}
1099early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1100
1101static int nr_mtrr_spare_reg __initdata =
1102 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1103
1104static int __init parse_mtrr_spare_reg(char *arg)
1105{
1106 if (arg)
1107 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1108 return 0;
1109}
1110
1111early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1112
1113static int __init
1114x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1115 u64 chunk_size, u64 gran_size)
1116{
1117 struct var_mtrr_state var_state;
1118 int i;
1119 int num_reg;
1120
1121 var_state.range_startk = 0;
1122 var_state.range_sizek = 0;
1123 var_state.reg = 0;
1124 var_state.chunk_sizek = chunk_size >> 10;
1125 var_state.gran_sizek = gran_size >> 10;
1126
1127 memset(range_state, 0, sizeof(range_state));
1128
1129 /* Write the range etc */
1130 for (i = 0; i < nr_range; i++)
1131 set_var_mtrr_range(&var_state, range[i].start,
1132 range[i].end - range[i].start + 1);
1133
1134 /* Write the last range */
1135 if (var_state.range_sizek != 0)
1136 range_to_mtrr_with_hole(&var_state, 0, 0);
1137
1138 num_reg = var_state.reg;
1139 /* Clear out the extra MTRR's */
1140 while (var_state.reg < num_var_ranges) {
1141 save_var_mtrr(var_state.reg, 0, 0, 0);
1142 var_state.reg++;
1143 }
1144
1145 return num_reg;
1146}
1147
1148struct mtrr_cleanup_result {
1149 unsigned long gran_sizek;
1150 unsigned long chunk_sizek;
1151 unsigned long lose_cover_sizek;
1152 unsigned int num_reg;
1153 int bad;
1154};
1155
1156/*
1157 * gran_size: 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G
1159 * so we need (2+13)*6
1160 */
1161#define NUM_RESULT 90
1162#define PSHIFT (PAGE_SHIFT - 10)
1163
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1165static struct res_range __initdata range_new[RANGE_NUM];
1166static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1167
1168static int __init mtrr_cleanup(unsigned address_bits)
1169{
1170 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy;
1172 mtrr_type type;
1173 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new;
1176 int index_good;
1177 int num_reg_good;
1178
1179 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1];
1181
1182 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1183 return 0;
1184 rdmsr(MTRRdefType_MSR, def, dummy);
1185 def &= 0xff;
1186 if (def != MTRR_TYPE_UNCACHABLE)
1187 return 0;
1188
1189 /* get it and store it aside */
1190 memset(range_state, 0, sizeof(range_state));
1191 for (i = 0; i < num_var_ranges; i++) {
1192 mtrr_if->get(i, &base, &size, &type);
1193 range_state[i].base_pfn = base;
1194 range_state[i].size_pfn = size;
1195 range_state[i].type = type;
1196 }
1197
1198 /* check entries number */
1199 memset(num, 0, sizeof(num));
1200 for (i = 0; i < num_var_ranges; i++) {
1201 type = range_state[i].type;
1202 size = range_state[i].size_pfn;
1203 if (type >= MTRR_NUM_TYPES)
1204 continue;
1205 if (!size)
1206 type = MTRR_NUM_TYPES;
1207 num[type]++;
1208 }
1209
1210 /* check if we got UC entries */
1211 if (!num[MTRR_TYPE_UNCACHABLE])
1212 return 0;
1213
1214 /* check if we only had WB and UC */
1215 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1216 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0;
1218
1219 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0;
1221 if (mtrr_tom2) {
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1223 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size);
1228 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT));
1231
1232 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg;
1234
1235 debug_print = 1;
1236 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size);
1239
1240 /* we got new setting in range_state, check it */
1241 memset(range_new, 0, sizeof(range_new));
1242 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1243 extra_remove_base,
1244 extra_remove_size);
1245 range_sums_new = sum_ranges(range_new, nr_range_new);
1246
1247 i = 0;
1248 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1249 result[i].gran_sizek = mtrr_gran_size >> 10;
1250 result[i].num_reg = num_reg;
1251 if (range_sums < range_sums_new) {
1252 result[i].lose_cover_sizek =
1253 (range_sums_new - range_sums) << PSHIFT;
1254 result[i].bad = 1;
1255 } else
1256 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT;
1258
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
1261 result[i].chunk_sizek >> 10);
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
1263 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10);
1265 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits);
1267 return 1;
1268 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n");
1271 debug_print = 0;
1272 memset(result, 0, sizeof(result[0]));
1273 }
1274
1275 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33);
1280 chunk_size <<= 1) {
1281 int num_reg;
1282
1283 if (debug_print)
1284 printk(KERN_INFO
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n",
1286 gran_size >> 20, chunk_size >> 20);
1287 if (i >= NUM_RESULT)
1288 continue;
1289
1290 /* convert ranges to var ranges state */
1291 num_reg = x86_setup_var_mtrrs(range, nr_range,
1292 chunk_size, gran_size);
1293
1294 /* we got new setting in range_state, check it */
1295 memset(range_new, 0, sizeof(range_new));
1296 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1297 extra_remove_base, extra_remove_size);
1298 range_sums_new = sum_ranges(range_new, nr_range_new);
1299
1300 result[i].chunk_sizek = chunk_size >> 10;
1301 result[i].gran_sizek = gran_size >> 10;
1302 result[i].num_reg = num_reg;
1303 if (range_sums < range_sums_new) {
1304 result[i].lose_cover_sizek =
1305 (range_sums_new - range_sums) << PSHIFT;
1306 result[i].bad = 1;
1307 } else
1308 result[i].lose_cover_sizek =
1309 (range_sums - range_sums_new) << PSHIFT;
1310
1311 /* double check it */
1312 if (!result[i].bad && !result[i].lose_cover_sizek) {
1313 if (nr_range_new != nr_range ||
1314 memcmp(range, range_new, sizeof(range)))
1315 result[i].bad = 1;
1316 }
1317
1318 if (!result[i].bad && (range_sums - range_sums_new <
1319 min_loss_pfn[num_reg])) {
1320 min_loss_pfn[num_reg] =
1321 range_sums - range_sums_new;
1322 }
1323 i++;
1324 }
1325 }
1326
1327 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
1331 result[i].chunk_sizek >> 10);
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
1333 result[i].num_reg, result[i].bad?"-":"",
1334 result[i].lose_cover_sizek >> 10);
1335 }
1336
1337 /* try to find the optimal index */
1338 if (nr_mtrr_spare_reg >= num_var_ranges)
1339 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) {
1343 num_reg_good = i;
1344 break;
1345 }
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 if (index_good != -1) {
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
1364 result[i].gran_sizek >> 10,
1365 result[i].chunk_sizek >> 10);
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
1367 result[i].num_reg,
1368 result[i].lose_cover_sizek >> 10);
1369 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10;
1374 debug_print = 1;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1376 set_var_mtrr_all(address_bits);
1377 return 1;
1378 }
1379
1380 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
1381 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1382
1383 return 0;
1384}
1385#else
1386static int __init mtrr_cleanup(unsigned address_bits)
1387{
1388 return 0;
1389}
1390#endif
1391
1392static int __initdata changed_by_mtrr_cleanup;
1393
612static int disable_mtrr_trim; 1394static int disable_mtrr_trim;
613 1395
614static int __init disable_mtrr_trim_setup(char *str) 1396static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
648 return 0; 1430 return 0;
649} 1431}
650 1432
1433static u64 __init real_trim_memory(unsigned long start_pfn,
1434 unsigned long limit_pfn)
1435{
1436 u64 trim_start, trim_size;
1437 trim_start = start_pfn;
1438 trim_start <<= PAGE_SHIFT;
1439 trim_size = limit_pfn;
1440 trim_size <<= PAGE_SHIFT;
1441 trim_size -= trim_start;
1442
1443 return e820_update_range(trim_start, trim_size, E820_RAM,
1444 E820_RESERVED);
1445}
651/** 1446/**
652 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs 1447 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
653 * @end_pfn: ending page frame number 1448 * @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
663{ 1458{
664 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1459 unsigned long i, base, size, highest_pfn = 0, def, dummy;
665 mtrr_type type; 1460 mtrr_type type;
666 u64 trim_start, trim_size; 1461 int nr_range;
1462 u64 total_trim_size;
667 1463
1464 /* extra one for all 0 */
1465 int num[MTRR_NUM_TYPES + 1];
668 /* 1466 /*
669 * Make sure we only trim uncachable memory on machines that 1467 * Make sure we only trim uncachable memory on machines that
670 * support the Intel MTRR architecture: 1468 * support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
676 if (def != MTRR_TYPE_UNCACHABLE) 1474 if (def != MTRR_TYPE_UNCACHABLE)
677 return 0; 1475 return 0;
678 1476
679 if (amd_special_default_mtrr()) 1477 /* get it and store it aside */
680 return 0; 1478 memset(range_state, 0, sizeof(range_state));
1479 for (i = 0; i < num_var_ranges; i++) {
1480 mtrr_if->get(i, &base, &size, &type);
1481 range_state[i].base_pfn = base;
1482 range_state[i].size_pfn = size;
1483 range_state[i].type = type;
1484 }
681 1485
682 /* Find highest cached pfn */ 1486 /* Find highest cached pfn */
683 for (i = 0; i < num_var_ranges; i++) { 1487 for (i = 0; i < num_var_ranges; i++) {
684 mtrr_if->get(i, &base, &size, &type); 1488 type = range_state[i].type;
685 if (type != MTRR_TYPE_WRBACK) 1489 if (type != MTRR_TYPE_WRBACK)
686 continue; 1490 continue;
1491 base = range_state[i].base_pfn;
1492 size = range_state[i].size_pfn;
687 if (highest_pfn < base + size) 1493 if (highest_pfn < base + size)
688 highest_pfn = base + size; 1494 highest_pfn = base + size;
689 } 1495 }
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
698 return 0; 1504 return 0;
699 } 1505 }
700 1506
701 if (highest_pfn < end_pfn) { 1507 /* check entries number */
1508 memset(num, 0, sizeof(num));
1509 for (i = 0; i < num_var_ranges; i++) {
1510 type = range_state[i].type;
1511 if (type >= MTRR_NUM_TYPES)
1512 continue;
1513 size = range_state[i].size_pfn;
1514 if (!size)
1515 type = MTRR_NUM_TYPES;
1516 num[type]++;
1517 }
1518
1519 /* no entry for WB? */
1520 if (!num[MTRR_TYPE_WRBACK])
1521 return 0;
1522
1523 /* check if we only had WB and UC */
1524 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1525 num_var_ranges - num[MTRR_NUM_TYPES])
1526 return 0;
1527
1528 memset(range, 0, sizeof(range));
1529 nr_range = 0;
1530 if (mtrr_tom2) {
1531 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1532 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1533 if (highest_pfn < range[nr_range].end + 1)
1534 highest_pfn = range[nr_range].end + 1;
1535 nr_range++;
1536 }
1537 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1538
1539 total_trim_size = 0;
1540 /* check the head */
1541 if (range[0].start)
1542 total_trim_size += real_trim_memory(0, range[0].start);
1543 /* check the holes */
1544 for (i = 0; i < nr_range - 1; i++) {
1545 if (range[i].end + 1 < range[i+1].start)
1546 total_trim_size += real_trim_memory(range[i].end + 1,
1547 range[i+1].start);
1548 }
1549 /* check the top */
1550 i = nr_range - 1;
1551 if (range[i].end + 1 < end_pfn)
1552 total_trim_size += real_trim_memory(range[i].end + 1,
1553 end_pfn);
1554
1555 if (total_trim_size) {
702 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" 1556 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
703 " all of memory, losing %luMB of RAM.\n", 1557 " all of memory, losing %lluMB of RAM.\n",
704 (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); 1558 total_trim_size >> 20);
705 1559
706 WARN_ON(1); 1560 if (!changed_by_mtrr_cleanup)
1561 WARN_ON(1);
707 1562
708 printk(KERN_INFO "update e820 for mtrr\n"); 1563 printk(KERN_INFO "update e820 for mtrr\n");
709 trim_start = highest_pfn;
710 trim_start <<= PAGE_SHIFT;
711 trim_size = end_pfn;
712 trim_size <<= PAGE_SHIFT;
713 trim_size -= trim_start;
714 update_memory_range(trim_start, trim_size, E820_RAM,
715 E820_RESERVED);
716 update_e820(); 1564 update_e820();
1565
717 return 1; 1566 return 1;
718 } 1567 }
719 1568
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
729 */ 1578 */
730void __init mtrr_bp_init(void) 1579void __init mtrr_bp_init(void)
731{ 1580{
1581 u32 phys_addr;
732 init_ifs(); 1582 init_ifs();
733 1583
1584 phys_addr = 32;
1585
734 if (cpu_has_mtrr) { 1586 if (cpu_has_mtrr) {
735 mtrr_if = &generic_mtrr_ops; 1587 mtrr_if = &generic_mtrr_ops;
736 size_or_mask = 0xff000000; /* 36 bits */ 1588 size_or_mask = 0xff000000; /* 36 bits */
737 size_and_mask = 0x00f00000; 1589 size_and_mask = 0x00f00000;
1590 phys_addr = 36;
738 1591
739 /* This is an AMD specific MSR, but we assume(hope?) that 1592 /* This is an AMD specific MSR, but we assume(hope?) that
740 Intel will implement it to when they extend the address 1593 Intel will implement it to when they extend the address
741 bus of the Xeon. */ 1594 bus of the Xeon. */
742 if (cpuid_eax(0x80000000) >= 0x80000008) { 1595 if (cpuid_eax(0x80000000) >= 0x80000008) {
743 u32 phys_addr;
744 phys_addr = cpuid_eax(0x80000008) & 0xff; 1596 phys_addr = cpuid_eax(0x80000008) & 0xff;
745 /* CPUID workaround for Intel 0F33/0F34 CPU */ 1597 /* CPUID workaround for Intel 0F33/0F34 CPU */
746 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 1598 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
758 don't support PAE */ 1610 don't support PAE */
759 size_or_mask = 0xfff00000; /* 32 bits */ 1611 size_or_mask = 0xfff00000; /* 32 bits */
760 size_and_mask = 0; 1612 size_and_mask = 0;
1613 phys_addr = 32;
761 } 1614 }
762 } else { 1615 } else {
763 switch (boot_cpu_data.x86_vendor) { 1616 switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
791 if (mtrr_if) { 1644 if (mtrr_if) {
792 set_num_var_ranges(); 1645 set_num_var_ranges();
793 init_table(); 1646 init_table();
794 if (use_intel()) 1647 if (use_intel()) {
795 get_mtrr_state(); 1648 get_mtrr_state();
1649
1650 if (mtrr_cleanup(phys_addr)) {
1651 changed_by_mtrr_cleanup = 1;
1652 mtrr_if->set_all();
1653 }
1654
1655 }
796 } 1656 }
797} 1657}
798 1658
@@ -822,16 +1682,17 @@ void mtrr_ap_init(void)
822 */ 1682 */
823void mtrr_save_state(void) 1683void mtrr_save_state(void)
824{ 1684{
825 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); 1685 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
826} 1686}
827 1687
828static int __init mtrr_init_finialize(void) 1688static int __init mtrr_init_finialize(void)
829{ 1689{
830 if (!mtrr_if) 1690 if (!mtrr_if)
831 return 0; 1691 return 0;
832 if (use_intel()) 1692 if (use_intel()) {
833 mtrr_state_warn(); 1693 if (!changed_by_mtrr_cleanup)
834 else { 1694 mtrr_state_warn();
1695 } else {
835 /* The CPUs haven't MTRR and seem to not support SMP. They have 1696 /* The CPUs haven't MTRR and seem to not support SMP. They have
836 * specific drivers, we use a tricky method to support 1697 * specific drivers, we use a tricky method to support
837 * suspend/resume for them. 1698 * suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
83 83
84void fill_mtrr_var_range(unsigned int index,
85 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
84void get_mtrr_state(void); 86void get_mtrr_state(void);
85 87
86extern void set_mtrr_ops(struct mtrr_ops * ops); 88extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
92#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 94#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
93 95
94extern unsigned int num_var_ranges; 96extern unsigned int num_var_ranges;
97extern u64 mtrr_tom2;
95 98
96void mtrr_state_warn(void); 99void mtrr_state_warn(void);
97const char *mtrr_attrib_to_str(int x); 100const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f9ae93adffe5..de7439f82b92 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,11 +1,15 @@
1/* local apic based NMI watchdog for various CPUs. 1/*
2 This file also handles reservation of performance counters for coordination 2 * local apic based NMI watchdog for various CPUs.
3 with other users (like oprofile). 3 *
4 4 * This file also handles reservation of performance counters for coordination
5 Note that these events normally don't tick when the CPU idles. This means 5 * with other users (like oprofile).
6 the frequency varies with CPU load. 6 *
7 7 * Note that these events normally don't tick when the CPU idles. This means
8 Original code for K7/P6 written by Keith Owens */ 8 * the frequency varies with CPU load.
9 *
10 * Original code for K7/P6 written by Keith Owens
11 *
12 */
9 13
10#include <linux/percpu.h> 14#include <linux/percpu.h>
11#include <linux/module.h> 15#include <linux/module.h>
@@ -36,12 +40,16 @@ struct wd_ops {
36 40
37static const struct wd_ops *wd_ops; 41static const struct wd_ops *wd_ops;
38 42
39/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 43/*
40 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 44 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
45 * offset from MSR_P4_BSU_ESCR0.
46 *
47 * It will be the max for all platforms (for now)
41 */ 48 */
42#define NMI_MAX_COUNTER_BITS 66 49#define NMI_MAX_COUNTER_BITS 66
43 50
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers: 51/*
52 * perfctr_nmi_owner tracks the ownership of the perfctr registers:
45 * evtsel_nmi_owner tracks the ownership of the event selection 53 * evtsel_nmi_owner tracks the ownership of the event selection
46 * - different performance counters/ event selection may be reserved for 54 * - different performance counters/ event selection may be reserved for
47 * different subsystems this reservation system just tries to coordinate 55 * different subsystems this reservation system just tries to coordinate
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
73 return 0; 81 return 0;
74} 82}
75 83
76/* converts an msr to an appropriate reservation bit */ 84/*
77/* returns the bit offset of the event selection register */ 85 * converts an msr to an appropriate reservation bit
86 * returns the bit offset of the event selection register
87 */
78static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) 88static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
79{ 89{
80 /* returns the bit offset of the event selection register */ 90 /* returns the bit offset of the event selection register */
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
114 124
115 return (!test_bit(counter, perfctr_nmi_owner)); 125 return (!test_bit(counter, perfctr_nmi_owner));
116} 126}
127EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
117 128
118int reserve_perfctr_nmi(unsigned int msr) 129int reserve_perfctr_nmi(unsigned int msr)
119{ 130{
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr)
128 return 1; 139 return 1;
129 return 0; 140 return 0;
130} 141}
142EXPORT_SYMBOL(reserve_perfctr_nmi);
131 143
132void release_perfctr_nmi(unsigned int msr) 144void release_perfctr_nmi(unsigned int msr)
133{ 145{
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr)
140 152
141 clear_bit(counter, perfctr_nmi_owner); 153 clear_bit(counter, perfctr_nmi_owner);
142} 154}
155EXPORT_SYMBOL(release_perfctr_nmi);
143 156
144int reserve_evntsel_nmi(unsigned int msr) 157int reserve_evntsel_nmi(unsigned int msr)
145{ 158{
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr)
154 return 1; 167 return 1;
155 return 0; 168 return 0;
156} 169}
170EXPORT_SYMBOL(reserve_evntsel_nmi);
157 171
158void release_evntsel_nmi(unsigned int msr) 172void release_evntsel_nmi(unsigned int msr)
159{ 173{
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr)
166 180
167 clear_bit(counter, evntsel_nmi_owner); 181 clear_bit(counter, evntsel_nmi_owner);
168} 182}
169
170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
171EXPORT_SYMBOL(reserve_perfctr_nmi);
172EXPORT_SYMBOL(release_perfctr_nmi);
173EXPORT_SYMBOL(reserve_evntsel_nmi);
174EXPORT_SYMBOL(release_evntsel_nmi); 183EXPORT_SYMBOL(release_evntsel_nmi);
175 184
176void disable_lapic_nmi_watchdog(void) 185void disable_lapic_nmi_watchdog(void)
@@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void)
180 if (atomic_read(&nmi_active) <= 0) 189 if (atomic_read(&nmi_active) <= 0)
181 return; 190 return;
182 191
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); 192 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184 wd_ops->unreserve(); 193
194 if (wd_ops)
195 wd_ops->unreserve();
185 196
186 BUG_ON(atomic_read(&nmi_active) != 0); 197 BUG_ON(atomic_read(&nmi_active) != 0);
187} 198}
@@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void)
202 return; 213 return;
203 } 214 }
204 215
205 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); 216 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
206 touch_nmi_watchdog(); 217 touch_nmi_watchdog();
207} 218}
208 219
@@ -232,31 +243,32 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
232 return retval; 243 return retval;
233} 244}
234 245
235static void 246static void write_watchdog_counter(unsigned int perfctr_msr,
236write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) 247 const char *descr, unsigned nmi_hz)
237{ 248{
238 u64 count = (u64)cpu_khz * 1000; 249 u64 count = (u64)cpu_khz * 1000;
239 250
240 do_div(count, nmi_hz); 251 do_div(count, nmi_hz);
241 if(descr) 252 if(descr)
242 Dprintk("setting %s to -0x%08Lx\n", descr, count); 253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
243 wrmsrl(perfctr_msr, 0 - count); 254 wrmsrl(perfctr_msr, 0 - count);
244} 255}
245 256
246static void write_watchdog_counter32(unsigned int perfctr_msr, 257static void write_watchdog_counter32(unsigned int perfctr_msr,
247 const char *descr, unsigned nmi_hz) 258 const char *descr, unsigned nmi_hz)
248{ 259{
249 u64 count = (u64)cpu_khz * 1000; 260 u64 count = (u64)cpu_khz * 1000;
250 261
251 do_div(count, nmi_hz); 262 do_div(count, nmi_hz);
252 if(descr) 263 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsr(perfctr_msr, (u32)(-count), 0); 265 wrmsr(perfctr_msr, (u32)(-count), 0);
255} 266}
256 267
257/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface 268/*
258 nicely stable so there is not much variety */ 269 * AMD K7/K8/Family10h/Family11h support.
259 270 * AMD keeps this interface nicely stable so there is not much variety
271 */
260#define K7_EVNTSEL_ENABLE (1 << 22) 272#define K7_EVNTSEL_ENABLE (1 << 22)
261#define K7_EVNTSEL_INT (1 << 20) 273#define K7_EVNTSEL_INT (1 << 20)
262#define K7_EVNTSEL_OS (1 << 17) 274#define K7_EVNTSEL_OS (1 << 17)
@@ -289,7 +301,7 @@ static int setup_k7_watchdog(unsigned nmi_hz)
289 301
290 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
291 wd->evntsel_msr = evntsel_msr; 303 wd->evntsel_msr = evntsel_msr;
292 wd->cccr_msr = 0; //unused 304 wd->cccr_msr = 0; /* unused */
293 return 1; 305 return 1;
294} 306}
295 307
@@ -325,18 +337,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
325} 337}
326 338
327static const struct wd_ops k7_wd_ops = { 339static const struct wd_ops k7_wd_ops = {
328 .reserve = single_msr_reserve, 340 .reserve = single_msr_reserve,
329 .unreserve = single_msr_unreserve, 341 .unreserve = single_msr_unreserve,
330 .setup = setup_k7_watchdog, 342 .setup = setup_k7_watchdog,
331 .rearm = single_msr_rearm, 343 .rearm = single_msr_rearm,
332 .stop = single_msr_stop_watchdog, 344 .stop = single_msr_stop_watchdog,
333 .perfctr = MSR_K7_PERFCTR0, 345 .perfctr = MSR_K7_PERFCTR0,
334 .evntsel = MSR_K7_EVNTSEL0, 346 .evntsel = MSR_K7_EVNTSEL0,
335 .checkbit = 1ULL<<47, 347 .checkbit = 1ULL << 47,
336}; 348};
337 349
338/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ 350/*
339 351 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
352 */
340#define P6_EVNTSEL0_ENABLE (1 << 22) 353#define P6_EVNTSEL0_ENABLE (1 << 22)
341#define P6_EVNTSEL_INT (1 << 20) 354#define P6_EVNTSEL_INT (1 << 20)
342#define P6_EVNTSEL_OS (1 << 17) 355#define P6_EVNTSEL_OS (1 << 17)
@@ -372,52 +385,58 @@ static int setup_p6_watchdog(unsigned nmi_hz)
372 385
373 wd->perfctr_msr = perfctr_msr; 386 wd->perfctr_msr = perfctr_msr;
374 wd->evntsel_msr = evntsel_msr; 387 wd->evntsel_msr = evntsel_msr;
375 wd->cccr_msr = 0; //unused 388 wd->cccr_msr = 0; /* unused */
376 return 1; 389 return 1;
377} 390}
378 391
379static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 392static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
380{ 393{
381 /* P6 based Pentium M need to re-unmask 394 /*
395 * P6 based Pentium M need to re-unmask
382 * the apic vector but it doesn't hurt 396 * the apic vector but it doesn't hurt
383 * other P6 variant. 397 * other P6 variant.
384 * ArchPerfom/Core Duo also needs this */ 398 * ArchPerfom/Core Duo also needs this
399 */
385 apic_write(APIC_LVTPC, APIC_DM_NMI); 400 apic_write(APIC_LVTPC, APIC_DM_NMI);
401
386 /* P6/ARCH_PERFMON has 32 bit counter write */ 402 /* P6/ARCH_PERFMON has 32 bit counter write */
387 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); 403 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
388} 404}
389 405
390static const struct wd_ops p6_wd_ops = { 406static const struct wd_ops p6_wd_ops = {
391 .reserve = single_msr_reserve, 407 .reserve = single_msr_reserve,
392 .unreserve = single_msr_unreserve, 408 .unreserve = single_msr_unreserve,
393 .setup = setup_p6_watchdog, 409 .setup = setup_p6_watchdog,
394 .rearm = p6_rearm, 410 .rearm = p6_rearm,
395 .stop = single_msr_stop_watchdog, 411 .stop = single_msr_stop_watchdog,
396 .perfctr = MSR_P6_PERFCTR0, 412 .perfctr = MSR_P6_PERFCTR0,
397 .evntsel = MSR_P6_EVNTSEL0, 413 .evntsel = MSR_P6_EVNTSEL0,
398 .checkbit = 1ULL<<39, 414 .checkbit = 1ULL << 39,
399}; 415};
400 416
401/* Intel P4 performance counters. By far the most complicated of all. */ 417/*
402 418 * Intel P4 performance counters.
403#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 419 * By far the most complicated of all.
404#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 420 */
405#define P4_ESCR_OS (1<<3) 421#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
406#define P4_ESCR_USR (1<<2) 422#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
407#define P4_CCCR_OVF_PMI0 (1<<26) 423#define P4_ESCR_OS (1 << 3)
408#define P4_CCCR_OVF_PMI1 (1<<27) 424#define P4_ESCR_USR (1 << 2)
409#define P4_CCCR_THRESHOLD(N) ((N)<<20) 425#define P4_CCCR_OVF_PMI0 (1 << 26)
410#define P4_CCCR_COMPLEMENT (1<<19) 426#define P4_CCCR_OVF_PMI1 (1 << 27)
411#define P4_CCCR_COMPARE (1<<18) 427#define P4_CCCR_THRESHOLD(N) ((N) << 20)
412#define P4_CCCR_REQUIRED (3<<16) 428#define P4_CCCR_COMPLEMENT (1 << 19)
413#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 429#define P4_CCCR_COMPARE (1 << 18)
414#define P4_CCCR_ENABLE (1<<12) 430#define P4_CCCR_REQUIRED (3 << 16)
415#define P4_CCCR_OVF (1<<31) 431#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
416 432#define P4_CCCR_ENABLE (1 << 12)
417/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 433#define P4_CCCR_OVF (1 << 31)
418 CRU_ESCR0 (with any non-null event selector) through a complemented
419 max threshold. [IA32-Vol3, Section 14.9.9] */
420 434
435/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented
438 * max threshold. [IA32-Vol3, Section 14.9.9]
439 */
421static int setup_p4_watchdog(unsigned nmi_hz) 440static int setup_p4_watchdog(unsigned nmi_hz)
422{ 441{
423 unsigned int perfctr_msr, evntsel_msr, cccr_msr; 442 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
@@ -442,7 +461,8 @@ static int setup_p4_watchdog(unsigned nmi_hz)
442#endif 461#endif
443 ht_num = 0; 462 ht_num = 0;
444 463
445 /* performance counters are shared resources 464 /*
465 * performance counters are shared resources
446 * assign each hyperthread its own set 466 * assign each hyperthread its own set
447 * (re-use the ESCR0 register, seems safe 467 * (re-use the ESCR0 register, seems safe
448 * and keeps the cccr_val the same) 468 * and keeps the cccr_val the same)
@@ -540,20 +560,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
540} 560}
541 561
542static const struct wd_ops p4_wd_ops = { 562static const struct wd_ops p4_wd_ops = {
543 .reserve = p4_reserve, 563 .reserve = p4_reserve,
544 .unreserve = p4_unreserve, 564 .unreserve = p4_unreserve,
545 .setup = setup_p4_watchdog, 565 .setup = setup_p4_watchdog,
546 .rearm = p4_rearm, 566 .rearm = p4_rearm,
547 .stop = stop_p4_watchdog, 567 .stop = stop_p4_watchdog,
548 /* RED-PEN this is wrong for the other sibling */ 568 /* RED-PEN this is wrong for the other sibling */
549 .perfctr = MSR_P4_BPU_PERFCTR0, 569 .perfctr = MSR_P4_BPU_PERFCTR0,
550 .evntsel = MSR_P4_BSU_ESCR0, 570 .evntsel = MSR_P4_BSU_ESCR0,
551 .checkbit = 1ULL<<39, 571 .checkbit = 1ULL << 39,
552}; 572};
553 573
554/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully 574/*
555 all future Intel CPUs. */ 575 * Watchdog using the Intel architected PerfMon.
556 576 * Used for Core2 and hopefully all future Intel CPUs.
577 */
557#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 578#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
558#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK 579#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
559 580
@@ -599,19 +620,19 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
599 620
600 wd->perfctr_msr = perfctr_msr; 621 wd->perfctr_msr = perfctr_msr;
601 wd->evntsel_msr = evntsel_msr; 622 wd->evntsel_msr = evntsel_msr;
602 wd->cccr_msr = 0; //unused 623 wd->cccr_msr = 0; /* unused */
603 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
604 return 1; 625 return 1;
605} 626}
606 627
607static struct wd_ops intel_arch_wd_ops __read_mostly = { 628static struct wd_ops intel_arch_wd_ops __read_mostly = {
608 .reserve = single_msr_reserve, 629 .reserve = single_msr_reserve,
609 .unreserve = single_msr_unreserve, 630 .unreserve = single_msr_unreserve,
610 .setup = setup_intel_arch_watchdog, 631 .setup = setup_intel_arch_watchdog,
611 .rearm = p6_rearm, 632 .rearm = p6_rearm,
612 .stop = single_msr_stop_watchdog, 633 .stop = single_msr_stop_watchdog,
613 .perfctr = MSR_ARCH_PERFMON_PERFCTR1, 634 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
614 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, 635 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
615}; 636};
616 637
617static void probe_nmi_watchdog(void) 638static void probe_nmi_watchdog(void)
@@ -624,8 +645,10 @@ static void probe_nmi_watchdog(void)
624 wd_ops = &k7_wd_ops; 645 wd_ops = &k7_wd_ops;
625 break; 646 break;
626 case X86_VENDOR_INTEL: 647 case X86_VENDOR_INTEL:
627 /* Work around Core Duo (Yonah) errata AE49 where perfctr1 648 /*
628 doesn't have a working enable bit. */ 649 * Work around Core Duo (Yonah) errata AE49 where perfctr1
650 * doesn't have a working enable bit.
651 */
629 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { 652 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
630 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; 653 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
631 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; 654 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
@@ -636,7 +659,7 @@ static void probe_nmi_watchdog(void)
636 } 659 }
637 switch (boot_cpu_data.x86) { 660 switch (boot_cpu_data.x86) {
638 case 6: 661 case 6:
639 if (boot_cpu_data.x86_model > 0xd) 662 if (boot_cpu_data.x86_model > 13)
640 return; 663 return;
641 664
642 wd_ops = &p6_wd_ops; 665 wd_ops = &p6_wd_ops;
@@ -697,10 +720,11 @@ int lapic_wd_event(unsigned nmi_hz)
697{ 720{
698 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 721 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
699 u64 ctr; 722 u64 ctr;
723
700 rdmsrl(wd->perfctr_msr, ctr); 724 rdmsrl(wd->perfctr_msr, ctr);
701 if (ctr & wd_ops->checkbit) { /* perfctr still running? */ 725 if (ctr & wd_ops->checkbit) /* perfctr still running? */
702 return 0; 726 return 0;
703 } 727
704 wd_ops->rearm(wd, nmi_hz); 728 wd_ops->rearm(wd, nmi_hz);
705 return 1; 729 return 1;
706} 730}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index daff52a62248..14b11b3be31c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,6 +33,7 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
36#include <linux/major.h> 37#include <linux/major.h>
37#include <linux/fs.h> 38#include <linux/fs.h>
38#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
95 for (; count; count -= 16) { 96 for (; count; count -= 16) {
96 cmd.eax = pos; 97 cmd.eax = pos;
97 cmd.ecx = pos >> 32; 98 cmd.ecx = pos >> 32;
98 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); 99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
99 if (copy_to_user(tmp, &cmd, 16)) 100 if (copy_to_user(tmp, &cmd, 16))
100 return -EFAULT; 101 return -EFAULT;
101 tmp += 16; 102 tmp += 16;
@@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
107 108
108static int cpuid_open(struct inode *inode, struct file *file) 109static int cpuid_open(struct inode *inode, struct file *file)
109{ 110{
110 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 111 unsigned int cpu;
111 struct cpuinfo_x86 *c = &cpu_data(cpu); 112 struct cpuinfo_x86 *c;
112 113 int ret = 0;
113 if (cpu >= NR_CPUS || !cpu_online(cpu)) 114
114 return -ENXIO; /* No such CPU */ 115 lock_kernel();
116
117 cpu = iminor(file->f_path.dentry->d_inode);
118 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
119 ret = -ENXIO; /* No such CPU */
120 goto out;
121 }
122 c = &cpu_data(cpu);
115 if (c->cpuid_level < 0) 123 if (c->cpuid_level < 0)
116 return -EIO; /* CPUID not supported */ 124 ret = -EIO; /* CPUID not supported */
117 125out:
118 return 0; 126 unlock_kernel();
127 return ret;
119} 128}
120 129
121/* 130/*
@@ -132,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu)
132{ 141{
133 struct device *dev; 142 struct device *dev;
134 143
135 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 144 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
136 "cpu%d", cpu); 145 NULL, "cpu%d", cpu);
137 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 146 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
138} 147}
139 148
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 000000000000..9af89078f7bb
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,1365 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h>
21#include <linux/suspend.h>
22#include <linux/firmware-map.h>
23
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h>
27#include <asm/proto.h>
28#include <asm/setup.h>
29#include <asm/trampoline.h>
30
31/*
32 * The e820 map is the map that gets modified e.g. with command line parameters
33 * and that is also registered with modifications in the kernel resource tree
34 * with the iomem_resource as parent.
35 *
36 * The e820_saved is directly saved after the BIOS-provided memory map is
37 * copied. It doesn't get modified afterwards. It's registered for the
38 * /sys/firmware/memmap interface.
39 *
40 * That memory map is not modified and is used as base for kexec. The kexec'd
41 * kernel should get the same memory map as the firmware provides. Then the
42 * user can e.g. boot the original kernel with mem=1G while still booting the
43 * next kernel with full memory.
44 */
45struct e820map e820;
46struct e820map e820_saved;
47
48/* For PCI or other memory-mapped resources */
49unsigned long pci_mem_start = 0xaeedbabe;
50#ifdef CONFIG_PCI
51EXPORT_SYMBOL(pci_mem_start);
52#endif
53
54/*
55 * This function checks if any part of the range <start,end> is mapped
56 * with type.
57 */
58int
59e820_any_mapped(u64 start, u64 end, unsigned type)
60{
61 int i;
62
63 for (i = 0; i < e820.nr_map; i++) {
64 struct e820entry *ei = &e820.map[i];
65
66 if (type && ei->type != type)
67 continue;
68 if (ei->addr >= end || ei->addr + ei->size <= start)
69 continue;
70 return 1;
71 }
72 return 0;
73}
74EXPORT_SYMBOL_GPL(e820_any_mapped);
75
76/*
77 * This function checks if the entire range <start,end> is mapped with type.
78 *
79 * Note: this function only works correct if the e820 table is sorted and
80 * not-overlapping, which is the case
81 */
82int __init e820_all_mapped(u64 start, u64 end, unsigned type)
83{
84 int i;
85
86 for (i = 0; i < e820.nr_map; i++) {
87 struct e820entry *ei = &e820.map[i];
88
89 if (type && ei->type != type)
90 continue;
91 /* is the region (part) in overlap with the current region ?*/
92 if (ei->addr >= end || ei->addr + ei->size <= start)
93 continue;
94
95 /* if the region is at the beginning of <start,end> we move
96 * start to the end of the region since it's ok until there
97 */
98 if (ei->addr <= start)
99 start = ei->addr + ei->size;
100 /*
101 * if start is now at or beyond end, we're done, full
102 * coverage
103 */
104 if (start >= end)
105 return 1;
106 }
107 return 0;
108}
109
110/*
111 * Add a memory region to the kernel e820 map.
112 */
113void __init e820_add_region(u64 start, u64 size, int type)
114{
115 int x = e820.nr_map;
116
117 if (x == ARRAY_SIZE(e820.map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return;
120 }
121
122 e820.map[x].addr = start;
123 e820.map[x].size = size;
124 e820.map[x].type = type;
125 e820.nr_map++;
126}
127
128void __init e820_print_map(char *who)
129{
130 int i;
131
132 for (i = 0; i < e820.nr_map; i++) {
133 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
134 (unsigned long long) e820.map[i].addr,
135 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) {
138 case E820_RAM:
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break;
154 }
155 }
156}
157
158/*
159 * Sanitize the BIOS e820 map.
160 *
161 * Some e820 responses include overlapping entries. The following
162 * replaces the original e820 map with a new one, removing overlaps,
163 * and resolving conflicting memory types in favor of highest
164 * numbered type.
165 *
166 * The input parameter biosmap points to an array of 'struct
167 * e820entry' which on entry has elements in the range [0, *pnr_map)
168 * valid, and which has space for up to max_nr_map entries.
169 * On return, the resulting sanitized e820 map entries will be in
170 * overwritten in the same location, starting at biosmap.
171 *
172 * The integer pointed to by pnr_map must be valid on entry (the
173 * current number of valid entries located at biosmap) and will
174 * be updated on return, with the new number of valid entries
175 * (something no more than max_nr_map.)
176 *
177 * The return value from sanitize_e820_map() is zero if it
178 * successfully 'sanitized' the map entries passed in, and is -1
179 * if it did nothing, which can happen if either of (1) it was
180 * only passed one map entry, or (2) any of the input map entries
181 * were invalid (start + size < start, meaning that the size was
182 * so big the described memory range wrapped around through zero.)
183 *
184 * Visually we're performing the following
185 * (1,2,3,4 = memory types)...
186 *
187 * Sample memory map (w/overlaps):
188 * ____22__________________
189 * ______________________4_
190 * ____1111________________
191 * _44_____________________
192 * 11111111________________
193 * ____________________33__
194 * ___________44___________
195 * __________33333_________
196 * ______________22________
197 * ___________________2222_
198 * _________111111111______
199 * _____________________11_
200 * _________________4______
201 *
202 * Sanitized equivalent (no overlap):
203 * 1_______________________
204 * _44_____________________
205 * ___1____________________
206 * ____22__________________
207 * ______11________________
208 * _________1______________
209 * __________3_____________
210 * ___________44___________
211 * _____________33_________
212 * _______________2________
213 * ________________1_______
214 * _________________4______
215 * ___________________2____
216 * ____________________33__
217 * ______________________4_
218 */
219
220int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
221 int *pnr_map)
222{
223 struct change_member {
224 struct e820entry *pbios; /* pointer to original bios entry */
225 unsigned long long addr; /* address for this change point */
226 };
227 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
228 static struct change_member *change_point[2*E820_X_MAX] __initdata;
229 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
230 static struct e820entry new_bios[E820_X_MAX] __initdata;
231 struct change_member *change_tmp;
232 unsigned long current_type, last_type;
233 unsigned long long last_addr;
234 int chgidx, still_changing;
235 int overlap_entries;
236 int new_bios_entry;
237 int old_nr, new_nr, chg_nr;
238 int i;
239
240 /* if there's only one memory region, don't bother */
241 if (*pnr_map < 2)
242 return -1;
243
244 old_nr = *pnr_map;
245 BUG_ON(old_nr > max_nr_map);
246
247 /* bail out if we find any unreasonable addresses in bios map */
248 for (i = 0; i < old_nr; i++)
249 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
250 return -1;
251
252 /* create pointers for initial change-point information (for sorting) */
253 for (i = 0; i < 2 * old_nr; i++)
254 change_point[i] = &change_point_list[i];
255
256 /* record all known change-points (starting and ending addresses),
257 omitting those that are for empty memory regions */
258 chgidx = 0;
259 for (i = 0; i < old_nr; i++) {
260 if (biosmap[i].size != 0) {
261 change_point[chgidx]->addr = biosmap[i].addr;
262 change_point[chgidx++]->pbios = &biosmap[i];
263 change_point[chgidx]->addr = biosmap[i].addr +
264 biosmap[i].size;
265 change_point[chgidx++]->pbios = &biosmap[i];
266 }
267 }
268 chg_nr = chgidx;
269
270 /* sort change-point list by memory addresses (low -> high) */
271 still_changing = 1;
272 while (still_changing) {
273 still_changing = 0;
274 for (i = 1; i < chg_nr; i++) {
275 unsigned long long curaddr, lastaddr;
276 unsigned long long curpbaddr, lastpbaddr;
277
278 curaddr = change_point[i]->addr;
279 lastaddr = change_point[i - 1]->addr;
280 curpbaddr = change_point[i]->pbios->addr;
281 lastpbaddr = change_point[i - 1]->pbios->addr;
282
283 /*
284 * swap entries, when:
285 *
286 * curaddr > lastaddr or
287 * curaddr == lastaddr and curaddr == curpbaddr and
288 * lastaddr != lastpbaddr
289 */
290 if (curaddr < lastaddr ||
291 (curaddr == lastaddr && curaddr == curpbaddr &&
292 lastaddr != lastpbaddr)) {
293 change_tmp = change_point[i];
294 change_point[i] = change_point[i-1];
295 change_point[i-1] = change_tmp;
296 still_changing = 1;
297 }
298 }
299 }
300
301 /* create a new bios memory map, removing overlaps */
302 overlap_entries = 0; /* number of entries in the overlap table */
303 new_bios_entry = 0; /* index for creating new bios map entries */
304 last_type = 0; /* start with undefined memory type */
305 last_addr = 0; /* start with 0 as last starting address */
306
307 /* loop through change-points, determining affect on the new bios map */
308 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
309 /* keep track of all overlapping bios entries */
310 if (change_point[chgidx]->addr ==
311 change_point[chgidx]->pbios->addr) {
312 /*
313 * add map entry to overlap list (> 1 entry
314 * implies an overlap)
315 */
316 overlap_list[overlap_entries++] =
317 change_point[chgidx]->pbios;
318 } else {
319 /*
320 * remove entry from list (order independent,
321 * so swap with last)
322 */
323 for (i = 0; i < overlap_entries; i++) {
324 if (overlap_list[i] ==
325 change_point[chgidx]->pbios)
326 overlap_list[i] =
327 overlap_list[overlap_entries-1];
328 }
329 overlap_entries--;
330 }
331 /*
332 * if there are overlapping entries, decide which
333 * "type" to use (larger value takes precedence --
334 * 1=usable, 2,3,4,4+=unusable)
335 */
336 current_type = 0;
337 for (i = 0; i < overlap_entries; i++)
338 if (overlap_list[i]->type > current_type)
339 current_type = overlap_list[i]->type;
340 /*
341 * continue building up new bios map based on this
342 * information
343 */
344 if (current_type != last_type) {
345 if (last_type != 0) {
346 new_bios[new_bios_entry].size =
347 change_point[chgidx]->addr - last_addr;
348 /*
349 * move forward only if the new size
350 * was non-zero
351 */
352 if (new_bios[new_bios_entry].size != 0)
353 /*
354 * no more space left for new
355 * bios entries ?
356 */
357 if (++new_bios_entry >= max_nr_map)
358 break;
359 }
360 if (current_type != 0) {
361 new_bios[new_bios_entry].addr =
362 change_point[chgidx]->addr;
363 new_bios[new_bios_entry].type = current_type;
364 last_addr = change_point[chgidx]->addr;
365 }
366 last_type = current_type;
367 }
368 }
369 /* retain count for new bios entries */
370 new_nr = new_bios_entry;
371
372 /* copy new bios mapping into original location */
373 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
374 *pnr_map = new_nr;
375
376 return 0;
377}
378
379static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
380{
381 while (nr_map) {
382 u64 start = biosmap->addr;
383 u64 size = biosmap->size;
384 u64 end = start + size;
385 u32 type = biosmap->type;
386
387 /* Overflow in 64 bits? Ignore the memory map. */
388 if (start > end)
389 return -1;
390
391 e820_add_region(start, size, type);
392
393 biosmap++;
394 nr_map--;
395 }
396 return 0;
397}
398
399/*
400 * Copy the BIOS e820 map into a safe place.
401 *
402 * Sanity-check it while we're at it..
403 *
404 * If we're lucky and live on a modern system, the setup code
405 * will have given us a memory map that we can use to properly
406 * set up memory. If we aren't, we'll fake a memory map.
407 */
408static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
409{
410 /* Only one memory region (or negative)? Ignore it */
411 if (nr_map < 2)
412 return -1;
413
414 return __append_e820_map(biosmap, nr_map);
415}
416
417static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
418 u64 size, unsigned old_type,
419 unsigned new_type)
420{
421 int i;
422 u64 real_updated_size = 0;
423
424 BUG_ON(old_type == new_type);
425
426 if (size > (ULLONG_MAX - start))
427 size = ULLONG_MAX - start;
428
429 for (i = 0; i < e820.nr_map; i++) {
430 struct e820entry *ei = &e820x->map[i];
431 u64 final_start, final_end;
432 if (ei->type != old_type)
433 continue;
434 /* totally covered? */
435 if (ei->addr >= start &&
436 (ei->addr + ei->size) <= (start + size)) {
437 ei->type = new_type;
438 real_updated_size += ei->size;
439 continue;
440 }
441 /* partially covered */
442 final_start = max(start, ei->addr);
443 final_end = min(start + size, ei->addr + ei->size);
444 if (final_start >= final_end)
445 continue;
446 e820_add_region(final_start, final_end - final_start,
447 new_type);
448 real_updated_size += final_end - final_start;
449
450 ei->size -= final_end - final_start;
451 if (ei->addr < final_start)
452 continue;
453 ei->addr = final_end;
454 }
455 return real_updated_size;
456}
457
458u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
459 unsigned new_type)
460{
461 return e820_update_range_map(&e820, start, size, old_type, new_type);
462}
463
464static u64 __init e820_update_range_saved(u64 start, u64 size,
465 unsigned old_type, unsigned new_type)
466{
467 return e820_update_range_map(&e820_saved, start, size, old_type,
468 new_type);
469}
470
471/* make e820 not cover the range */
472u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
473 int checktype)
474{
475 int i;
476 u64 real_removed_size = 0;
477
478 if (size > (ULLONG_MAX - start))
479 size = ULLONG_MAX - start;
480
481 for (i = 0; i < e820.nr_map; i++) {
482 struct e820entry *ei = &e820.map[i];
483 u64 final_start, final_end;
484
485 if (checktype && ei->type != old_type)
486 continue;
487 /* totally covered? */
488 if (ei->addr >= start &&
489 (ei->addr + ei->size) <= (start + size)) {
490 real_removed_size += ei->size;
491 memset(ei, 0, sizeof(struct e820entry));
492 continue;
493 }
494 /* partially covered */
495 final_start = max(start, ei->addr);
496 final_end = min(start + size, ei->addr + ei->size);
497 if (final_start >= final_end)
498 continue;
499 real_removed_size += final_end - final_start;
500
501 ei->size -= final_end - final_start;
502 if (ei->addr < final_start)
503 continue;
504 ei->addr = final_end;
505 }
506 return real_removed_size;
507}
508
509void __init update_e820(void)
510{
511 int nr_map;
512
513 nr_map = e820.nr_map;
514 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
515 return;
516 e820.nr_map = nr_map;
517 printk(KERN_INFO "modified physical RAM map:\n");
518 e820_print_map("modified");
519}
520static void __init update_e820_saved(void)
521{
522 int nr_map;
523
524 nr_map = e820_saved.nr_map;
525 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
526 return;
527 e820_saved.nr_map = nr_map;
528}
529#define MAX_GAP_END 0x100000000ull
530/*
531 * Search for a gap in the e820 memory space from start_addr to end_addr.
532 */
533__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
534 unsigned long start_addr, unsigned long long end_addr)
535{
536 unsigned long long last;
537 int i = e820.nr_map;
538 int found = 0;
539
540 last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
541
542 while (--i >= 0) {
543 unsigned long long start = e820.map[i].addr;
544 unsigned long long end = start + e820.map[i].size;
545
546 if (end < start_addr)
547 continue;
548
549 /*
550 * Since "last" is at most 4GB, we know we'll
551 * fit in 32 bits if this condition is true
552 */
553 if (last > end) {
554 unsigned long gap = last - end;
555
556 if (gap >= *gapsize) {
557 *gapsize = gap;
558 *gapstart = end;
559 found = 1;
560 }
561 }
562 if (start < last)
563 last = start;
564 }
565 return found;
566}
567
568/*
569 * Search for the biggest gap in the low 32 bits of the e820
570 * memory space. We pass this space to PCI to assign MMIO resources
571 * for hotplug or unconfigured devices in.
572 * Hopefully the BIOS let enough space left.
573 */
574__init void e820_setup_gap(void)
575{
576 unsigned long gapstart, gapsize, round;
577 int found;
578
579 gapstart = 0x10000000;
580 gapsize = 0x400000;
581 found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
582
583#ifdef CONFIG_X86_64
584 if (!found) {
585 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
586 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
587 "address range\n"
588 KERN_ERR "PCI: Unassigned devices with 32bit resource "
589 "registers may break!\n");
590 }
591#endif
592
593 /*
594 * See how much we want to round up: start off with
595 * rounding to the next 1MB area.
596 */
597 round = 0x100000;
598 while ((gapsize >> 4) > round)
599 round += round;
600 /* Fun with two's complement */
601 pci_mem_start = (gapstart + round) & -round;
602
603 printk(KERN_INFO
604 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
605 pci_mem_start, gapstart, gapsize);
606}
607
608/**
609 * Because of the size limitation of struct boot_params, only first
610 * 128 E820 memory entries are passed to kernel via
611 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
612 * linked list of struct setup_data, which is parsed here.
613 */
614void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
615{
616 u32 map_len;
617 int entries;
618 struct e820entry *extmap;
619
620 entries = sdata->len / sizeof(struct e820entry);
621 map_len = sdata->len + sizeof(struct setup_data);
622 if (map_len > PAGE_SIZE)
623 sdata = early_ioremap(pa_data, map_len);
624 extmap = (struct e820entry *)(sdata->data);
625 __append_e820_map(extmap, entries);
626 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
627 if (map_len > PAGE_SIZE)
628 early_iounmap(sdata, map_len);
629 printk(KERN_INFO "extended physical RAM map:\n");
630 e820_print_map("extended");
631}
632
633#if defined(CONFIG_X86_64) || \
634 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
635/**
636 * Find the ranges of physical addresses that do not correspond to
637 * e820 RAM areas and mark the corresponding pages as nosave for
638 * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
639 *
640 * This function requires the e820 map to be sorted and without any
641 * overlapping entries and assumes the first e820 area to be RAM.
642 */
643void __init e820_mark_nosave_regions(unsigned long limit_pfn)
644{
645 int i;
646 unsigned long pfn;
647
648 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
649 for (i = 1; i < e820.nr_map; i++) {
650 struct e820entry *ei = &e820.map[i];
651
652 if (pfn < PFN_UP(ei->addr))
653 register_nosave_region(pfn, PFN_UP(ei->addr));
654
655 pfn = PFN_DOWN(ei->addr + ei->size);
656 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
657 register_nosave_region(PFN_UP(ei->addr), pfn);
658
659 if (pfn >= limit_pfn)
660 break;
661 }
662}
663#endif
664
665/*
666 * Early reserved memory areas.
667 */
668#define MAX_EARLY_RES 20
669
670struct early_res {
671 u64 start, end;
672 char name[16];
673 char overlap_ok;
674};
675static struct early_res early_res[MAX_EARLY_RES] __initdata = {
676 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
677#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
678 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
679#endif
680#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
681 /*
682 * But first pinch a few for the stack/trampoline stuff
683 * FIXME: Don't need the extra page at 4K, but need to fix
684 * trampoline before removing it. (see the GDT stuff)
685 */
686 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
687 /*
688 * Has to be in very low memory so we can execute
689 * real-mode AP code.
690 */
691 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
692#endif
693 {}
694};
695
696static int __init find_overlapped_early(u64 start, u64 end)
697{
698 int i;
699 struct early_res *r;
700
701 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
702 r = &early_res[i];
703 if (end > r->start && start < r->end)
704 break;
705 }
706
707 return i;
708}
709
710/*
711 * Drop the i-th range from the early reservation map,
712 * by copying any higher ranges down one over it, and
713 * clearing what had been the last slot.
714 */
715static void __init drop_range(int i)
716{
717 int j;
718
719 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
720 ;
721
722 memmove(&early_res[i], &early_res[i + 1],
723 (j - 1 - i) * sizeof(struct early_res));
724
725 early_res[j - 1].end = 0;
726}
727
728/*
729 * Split any existing ranges that:
730 * 1) are marked 'overlap_ok', and
731 * 2) overlap with the stated range [start, end)
732 * into whatever portion (if any) of the existing range is entirely
733 * below or entirely above the stated range. Drop the portion
734 * of the existing range that overlaps with the stated range,
735 * which will allow the caller of this routine to then add that
736 * stated range without conflicting with any existing range.
737 */
738static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
739{
740 int i;
741 struct early_res *r;
742 u64 lower_start, lower_end;
743 u64 upper_start, upper_end;
744 char name[16];
745
746 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
747 r = &early_res[i];
748
749 /* Continue past non-overlapping ranges */
750 if (end <= r->start || start >= r->end)
751 continue;
752
753 /*
754 * Leave non-ok overlaps as is; let caller
755 * panic "Overlapping early reservations"
756 * when it hits this overlap.
757 */
758 if (!r->overlap_ok)
759 return;
760
761 /*
762 * We have an ok overlap. We will drop it from the early
763 * reservation map, and add back in any non-overlapping
764 * portions (lower or upper) as separate, overlap_ok,
765 * non-overlapping ranges.
766 */
767
768 /* 1. Note any non-overlapping (lower or upper) ranges. */
769 strncpy(name, r->name, sizeof(name) - 1);
770
771 lower_start = lower_end = 0;
772 upper_start = upper_end = 0;
773 if (r->start < start) {
774 lower_start = r->start;
775 lower_end = start;
776 }
777 if (r->end > end) {
778 upper_start = end;
779 upper_end = r->end;
780 }
781
782 /* 2. Drop the original ok overlapping range */
783 drop_range(i);
784
785 i--; /* resume for-loop on copied down entry */
786
787 /* 3. Add back in any non-overlapping ranges. */
788 if (lower_end)
789 reserve_early_overlap_ok(lower_start, lower_end, name);
790 if (upper_end)
791 reserve_early_overlap_ok(upper_start, upper_end, name);
792 }
793}
794
795static void __init __reserve_early(u64 start, u64 end, char *name,
796 int overlap_ok)
797{
798 int i;
799 struct early_res *r;
800
801 i = find_overlapped_early(start, end);
802 if (i >= MAX_EARLY_RES)
803 panic("Too many early reservations");
804 r = &early_res[i];
805 if (r->end)
806 panic("Overlapping early reservations "
807 "%llx-%llx %s to %llx-%llx %s\n",
808 start, end - 1, name?name:"", r->start,
809 r->end - 1, r->name);
810 r->start = start;
811 r->end = end;
812 r->overlap_ok = overlap_ok;
813 if (name)
814 strncpy(r->name, name, sizeof(r->name) - 1);
815}
816
817/*
818 * A few early reservtations come here.
819 *
820 * The 'overlap_ok' in the name of this routine does -not- mean it
821 * is ok for these reservations to overlap an earlier reservation.
822 * Rather it means that it is ok for subsequent reservations to
823 * overlap this one.
824 *
825 * Use this entry point to reserve early ranges when you are doing
826 * so out of "Paranoia", reserving perhaps more memory than you need,
827 * just in case, and don't mind a subsequent overlapping reservation
828 * that is known to be needed.
829 *
830 * The drop_overlaps_that_are_ok() call here isn't really needed.
831 * It would be needed if we had two colliding 'overlap_ok'
832 * reservations, so that the second such would not panic on the
833 * overlap with the first. We don't have any such as of this
834 * writing, but might as well tolerate such if it happens in
835 * the future.
836 */
837void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
838{
839 drop_overlaps_that_are_ok(start, end);
840 __reserve_early(start, end, name, 1);
841}
842
843/*
844 * Most early reservations come here.
845 *
846 * We first have drop_overlaps_that_are_ok() drop any pre-existing
847 * 'overlap_ok' ranges, so that we can then reserve this memory
848 * range without risk of panic'ing on an overlapping overlap_ok
849 * early reservation.
850 */
851void __init reserve_early(u64 start, u64 end, char *name)
852{
853 drop_overlaps_that_are_ok(start, end);
854 __reserve_early(start, end, name, 0);
855}
856
857void __init free_early(u64 start, u64 end)
858{
859 struct early_res *r;
860 int i;
861
862 i = find_overlapped_early(start, end);
863 r = &early_res[i];
864 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
865 panic("free_early on not reserved area: %llx-%llx!",
866 start, end - 1);
867
868 drop_range(i);
869}
870
871void __init early_res_to_bootmem(u64 start, u64 end)
872{
873 int i, count;
874 u64 final_start, final_end;
875
876 count = 0;
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++;
879
880 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
881 count, start, end);
882 for (i = 0; i < count; i++) {
883 struct early_res *r = &early_res[i];
884 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
885 r->start, r->end, r->name);
886 final_start = max(start, r->start);
887 final_end = min(end, r->end);
888 if (final_start >= final_end) {
889 printk(KERN_CONT "\n");
890 continue;
891 }
892 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
893 final_start, final_end);
894 reserve_bootmem_generic(final_start, final_end - final_start,
895 BOOTMEM_DEFAULT);
896 }
897}
898
899/* Check for already reserved areas */
900static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
901{
902 int i;
903 u64 addr = *addrp;
904 int changed = 0;
905 struct early_res *r;
906again:
907 i = find_overlapped_early(addr, addr + size);
908 r = &early_res[i];
909 if (i < MAX_EARLY_RES && r->end) {
910 *addrp = addr = round_up(r->end, align);
911 changed = 1;
912 goto again;
913 }
914 return changed;
915}
916
917/* Check for already reserved areas */
918static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
919{
920 int i;
921 u64 addr = *addrp, last;
922 u64 size = *sizep;
923 int changed = 0;
924again:
925 last = addr + size;
926 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
927 struct early_res *r = &early_res[i];
928 if (last > r->start && addr < r->start) {
929 size = r->start - addr;
930 changed = 1;
931 goto again;
932 }
933 if (last > r->end && addr < r->end) {
934 addr = round_up(r->end, align);
935 size = last - addr;
936 changed = 1;
937 goto again;
938 }
939 if (last <= r->end && addr >= r->start) {
940 (*sizep)++;
941 return 0;
942 }
943 }
944 if (changed) {
945 *addrp = addr;
946 *sizep = size;
947 }
948 return changed;
949}
950
951/*
952 * Find a free area with specified alignment in a specific range.
953 */
954u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
955{
956 int i;
957
958 for (i = 0; i < e820.nr_map; i++) {
959 struct e820entry *ei = &e820.map[i];
960 u64 addr, last;
961 u64 ei_last;
962
963 if (ei->type != E820_RAM)
964 continue;
965 addr = round_up(ei->addr, align);
966 ei_last = ei->addr + ei->size;
967 if (addr < start)
968 addr = round_up(start, align);
969 if (addr >= ei_last)
970 continue;
971 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
972 ;
973 last = addr + size;
974 if (last > ei_last)
975 continue;
976 if (last > end)
977 continue;
978 return addr;
979 }
980 return -1ULL;
981}
982
983/*
984 * Find next free range after *start
985 */
986u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
987{
988 int i;
989
990 for (i = 0; i < e820.nr_map; i++) {
991 struct e820entry *ei = &e820.map[i];
992 u64 addr, last;
993 u64 ei_last;
994
995 if (ei->type != E820_RAM)
996 continue;
997 addr = round_up(ei->addr, align);
998 ei_last = ei->addr + ei->size;
999 if (addr < start)
1000 addr = round_up(start, align);
1001 if (addr >= ei_last)
1002 continue;
1003 *sizep = ei_last - addr;
1004 while (bad_addr_size(&addr, sizep, align) &&
1005 addr + *sizep <= ei_last)
1006 ;
1007 last = addr + *sizep;
1008 if (last > ei_last)
1009 continue;
1010 return addr;
1011 }
1012 return -1UL;
1013
1014}
1015
1016/*
1017 * pre allocated 4k and reserved it in e820
1018 */
1019u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1020{
1021 u64 size = 0;
1022 u64 addr;
1023 u64 start;
1024
1025 start = startt;
1026 while (size < sizet)
1027 start = find_e820_area_size(start, &size, align);
1028
1029 if (size < sizet)
1030 return 0;
1031
1032 addr = round_down(start + size - sizet, align);
1033 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1034 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1035 printk(KERN_INFO "update e820 for early_reserve_e820\n");
1036 update_e820();
1037 update_e820_saved();
1038
1039 return addr;
1040}
1041
1042#ifdef CONFIG_X86_32
1043# ifdef CONFIG_X86_PAE
1044# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
1045# else
1046# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
1047# endif
1048#else /* CONFIG_X86_32 */
1049# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
1050#endif
1051
1052/*
1053 * Find the highest page frame number we have available
1054 */
1055static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
1056{
1057 int i;
1058 unsigned long last_pfn = 0;
1059 unsigned long max_arch_pfn = MAX_ARCH_PFN;
1060
1061 for (i = 0; i < e820.nr_map; i++) {
1062 struct e820entry *ei = &e820.map[i];
1063 unsigned long start_pfn;
1064 unsigned long end_pfn;
1065
1066 if (ei->type != type)
1067 continue;
1068
1069 start_pfn = ei->addr >> PAGE_SHIFT;
1070 end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
1071
1072 if (start_pfn >= limit_pfn)
1073 continue;
1074 if (end_pfn > limit_pfn) {
1075 last_pfn = limit_pfn;
1076 break;
1077 }
1078 if (end_pfn > last_pfn)
1079 last_pfn = end_pfn;
1080 }
1081
1082 if (last_pfn > max_arch_pfn)
1083 last_pfn = max_arch_pfn;
1084
1085 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
1086 last_pfn, max_arch_pfn);
1087 return last_pfn;
1088}
1089unsigned long __init e820_end_of_ram_pfn(void)
1090{
1091 return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
1092}
1093
1094unsigned long __init e820_end_of_low_ram_pfn(void)
1095{
1096 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
1097}
1098/*
1099 * Finds an active region in the address range from start_pfn to last_pfn and
1100 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
1101 */
1102int __init e820_find_active_region(const struct e820entry *ei,
1103 unsigned long start_pfn,
1104 unsigned long last_pfn,
1105 unsigned long *ei_startpfn,
1106 unsigned long *ei_endpfn)
1107{
1108 u64 align = PAGE_SIZE;
1109
1110 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
1111 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
1112
1113 /* Skip map entries smaller than a page */
1114 if (*ei_startpfn >= *ei_endpfn)
1115 return 0;
1116
1117 /* Skip if map is outside the node */
1118 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1119 *ei_startpfn >= last_pfn)
1120 return 0;
1121
1122 /* Check for overlaps */
1123 if (*ei_startpfn < start_pfn)
1124 *ei_startpfn = start_pfn;
1125 if (*ei_endpfn > last_pfn)
1126 *ei_endpfn = last_pfn;
1127
1128 return 1;
1129}
1130
1131/* Walk the e820 map and register active regions within a node */
1132void __init e820_register_active_regions(int nid, unsigned long start_pfn,
1133 unsigned long last_pfn)
1134{
1135 unsigned long ei_startpfn;
1136 unsigned long ei_endpfn;
1137 int i;
1138
1139 for (i = 0; i < e820.nr_map; i++)
1140 if (e820_find_active_region(&e820.map[i],
1141 start_pfn, last_pfn,
1142 &ei_startpfn, &ei_endpfn))
1143 add_active_range(nid, ei_startpfn, ei_endpfn);
1144}
1145
1146/*
1147 * Find the hole size (in bytes) in the memory range.
1148 * @start: starting address of the memory range to scan
1149 * @end: ending address of the memory range to scan
1150 */
1151u64 __init e820_hole_size(u64 start, u64 end)
1152{
1153 unsigned long start_pfn = start >> PAGE_SHIFT;
1154 unsigned long last_pfn = end >> PAGE_SHIFT;
1155 unsigned long ei_startpfn, ei_endpfn, ram = 0;
1156 int i;
1157
1158 for (i = 0; i < e820.nr_map; i++) {
1159 if (e820_find_active_region(&e820.map[i],
1160 start_pfn, last_pfn,
1161 &ei_startpfn, &ei_endpfn))
1162 ram += ei_endpfn - ei_startpfn;
1163 }
1164 return end - start - ((u64)ram << PAGE_SHIFT);
1165}
1166
1167static void early_panic(char *msg)
1168{
1169 early_printk(msg);
1170 panic(msg);
1171}
1172
1173static int userdef __initdata;
1174
1175/* "mem=nopentium" disables the 4MB page tables. */
1176static int __init parse_memopt(char *p)
1177{
1178 u64 mem_size;
1179
1180 if (!p)
1181 return -EINVAL;
1182
1183#ifdef CONFIG_X86_32
1184 if (!strcmp(p, "nopentium")) {
1185 setup_clear_cpu_cap(X86_FEATURE_PSE);
1186 return 0;
1187 }
1188#endif
1189
1190 userdef = 1;
1191 mem_size = memparse(p, &p);
1192 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1193
1194 return 0;
1195}
1196early_param("mem", parse_memopt);
1197
1198static int __init parse_memmap_opt(char *p)
1199{
1200 char *oldp;
1201 u64 start_at, mem_size;
1202
1203 if (!p)
1204 return -EINVAL;
1205
1206 if (!strcmp(p, "exactmap")) {
1207#ifdef CONFIG_CRASH_DUMP
1208 /*
1209 * If we are doing a crash dump, we still need to know
1210 * the real mem size before original memory map is
1211 * reset.
1212 */
1213 saved_max_pfn = e820_end_of_ram_pfn();
1214#endif
1215 e820.nr_map = 0;
1216 userdef = 1;
1217 return 0;
1218 }
1219
1220 oldp = p;
1221 mem_size = memparse(p, &p);
1222 if (p == oldp)
1223 return -EINVAL;
1224
1225 userdef = 1;
1226 if (*p == '@') {
1227 start_at = memparse(p+1, &p);
1228 e820_add_region(start_at, mem_size, E820_RAM);
1229 } else if (*p == '#') {
1230 start_at = memparse(p+1, &p);
1231 e820_add_region(start_at, mem_size, E820_ACPI);
1232 } else if (*p == '$') {
1233 start_at = memparse(p+1, &p);
1234 e820_add_region(start_at, mem_size, E820_RESERVED);
1235 } else
1236 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
1237
1238 return *p == '\0' ? 0 : -EINVAL;
1239}
1240early_param("memmap", parse_memmap_opt);
1241
1242void __init finish_e820_parsing(void)
1243{
1244 if (userdef) {
1245 int nr = e820.nr_map;
1246
1247 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
1248 early_panic("Invalid user supplied memory map");
1249 e820.nr_map = nr;
1250
1251 printk(KERN_INFO "user-defined physical RAM map:\n");
1252 e820_print_map("user");
1253 }
1254}
1255
1256static inline const char *e820_type_to_string(int e820_type)
1257{
1258 switch (e820_type) {
1259 case E820_RESERVED_KERN:
1260 case E820_RAM: return "System RAM";
1261 case E820_ACPI: return "ACPI Tables";
1262 case E820_NVS: return "ACPI Non-volatile Storage";
1263 default: return "reserved";
1264 }
1265}
1266
1267/*
1268 * Mark e820 reserved areas as busy for the resource manager.
1269 */
1270void __init e820_reserve_resources(void)
1271{
1272 int i;
1273 struct resource *res;
1274 u64 end;
1275
1276 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1277 for (i = 0; i < e820.nr_map; i++) {
1278 end = e820.map[i].addr + e820.map[i].size - 1;
1279#ifndef CONFIG_RESOURCES_64BIT
1280 if (end > 0x100000000ULL) {
1281 res++;
1282 continue;
1283 }
1284#endif
1285 res->name = e820_type_to_string(e820.map[i].type);
1286 res->start = e820.map[i].addr;
1287 res->end = end;
1288
1289 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1290 insert_resource(&iomem_resource, res);
1291 res++;
1292 }
1293
1294 for (i = 0; i < e820_saved.nr_map; i++) {
1295 struct e820entry *entry = &e820_saved.map[i];
1296 firmware_map_add_early(entry->addr,
1297 entry->addr + entry->size - 1,
1298 e820_type_to_string(entry->type));
1299 }
1300}
1301
1302char *__init default_machine_specific_memory_setup(void)
1303{
1304 char *who = "BIOS-e820";
1305 int new_nr;
1306 /*
1307 * Try to copy the BIOS-supplied E820-map.
1308 *
1309 * Otherwise fake a memory map; one section from 0k->640k,
1310 * the next section from 1mb->appropriate_mem_k
1311 */
1312 new_nr = boot_params.e820_entries;
1313 sanitize_e820_map(boot_params.e820_map,
1314 ARRAY_SIZE(boot_params.e820_map),
1315 &new_nr);
1316 boot_params.e820_entries = new_nr;
1317 if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
1318 < 0) {
1319 u64 mem_size;
1320
1321 /* compare results from other methods and take the greater */
1322 if (boot_params.alt_mem_k
1323 < boot_params.screen_info.ext_mem_k) {
1324 mem_size = boot_params.screen_info.ext_mem_k;
1325 who = "BIOS-88";
1326 } else {
1327 mem_size = boot_params.alt_mem_k;
1328 who = "BIOS-e801";
1329 }
1330
1331 e820.nr_map = 0;
1332 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
1333 e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
1334 }
1335
1336 /* In case someone cares... */
1337 return who;
1338}
1339
1340char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1341{
1342 if (x86_quirks->arch_memory_setup) {
1343 char *who = x86_quirks->arch_memory_setup();
1344
1345 if (who)
1346 return who;
1347 }
1348 return default_machine_specific_memory_setup();
1349}
1350
1351/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1352char * __init __attribute__((weak)) memory_setup(void)
1353{
1354 return machine_specific_memory_setup();
1355}
1356
1357void __init setup_memory_map(void)
1358{
1359 char *who;
1360
1361 who = memory_setup();
1362 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1363 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1364 e820_print_map(who);
1365}
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index ed733e7cf4e6..000000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,775 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/pfn.h>
11#include <linux/uaccess.h>
12#include <linux/suspend.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17#include <asm/setup.h>
18
19struct e820map e820;
20struct change_member {
21 struct e820entry *pbios; /* pointer to original bios entry */
22 unsigned long long addr; /* address for this change point */
23};
24static struct change_member change_point_list[2*E820MAX] __initdata;
25static struct change_member *change_point[2*E820MAX] __initdata;
26static struct e820entry *overlap_list[E820MAX] __initdata;
27static struct e820entry new_bios[E820MAX] __initdata;
28/* For PCI or other memory-mapped resources */
29unsigned long pci_mem_start = 0x10000000;
30#ifdef CONFIG_PCI
31EXPORT_SYMBOL(pci_mem_start);
32#endif
33extern int user_defined_memmap;
34
35static struct resource system_rom_resource = {
36 .name = "System ROM",
37 .start = 0xf0000,
38 .end = 0xfffff,
39 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
40};
41
42static struct resource extension_rom_resource = {
43 .name = "Extension ROM",
44 .start = 0xe0000,
45 .end = 0xeffff,
46 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
47};
48
49static struct resource adapter_rom_resources[] = { {
50 .name = "Adapter ROM",
51 .start = 0xc8000,
52 .end = 0,
53 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
54}, {
55 .name = "Adapter ROM",
56 .start = 0,
57 .end = 0,
58 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
59}, {
60 .name = "Adapter ROM",
61 .start = 0,
62 .end = 0,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64}, {
65 .name = "Adapter ROM",
66 .start = 0,
67 .end = 0,
68 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
69}, {
70 .name = "Adapter ROM",
71 .start = 0,
72 .end = 0,
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}, {
75 .name = "Adapter ROM",
76 .start = 0,
77 .end = 0,
78 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
79} };
80
81static struct resource video_rom_resource = {
82 .name = "Video ROM",
83 .start = 0xc0000,
84 .end = 0xc7fff,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86};
87
88#define ROMSIGNATURE 0xaa55
89
90static int __init romsignature(const unsigned char *rom)
91{
92 const unsigned short * const ptr = (const unsigned short *)rom;
93 unsigned short sig;
94
95 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
96}
97
98static int __init romchecksum(const unsigned char *rom, unsigned long length)
99{
100 unsigned char sum, c;
101
102 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
103 sum += c;
104 return !length && !sum;
105}
106
107static void __init probe_roms(void)
108{
109 const unsigned char *rom;
110 unsigned long start, length, upper;
111 unsigned char c;
112 int i;
113
114 /* video rom */
115 upper = adapter_rom_resources[0].start;
116 for (start = video_rom_resource.start; start < upper; start += 2048) {
117 rom = isa_bus_to_virt(start);
118 if (!romsignature(rom))
119 continue;
120
121 video_rom_resource.start = start;
122
123 if (probe_kernel_address(rom + 2, c) != 0)
124 continue;
125
126 /* 0 < length <= 0x7f * 512, historically */
127 length = c * 512;
128
129 /* if checksum okay, trust length byte */
130 if (length && romchecksum(rom, length))
131 video_rom_resource.end = start + length - 1;
132
133 request_resource(&iomem_resource, &video_rom_resource);
134 break;
135 }
136
137 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
138 if (start < upper)
139 start = upper;
140
141 /* system rom */
142 request_resource(&iomem_resource, &system_rom_resource);
143 upper = system_rom_resource.start;
144
145 /* check for extension rom (ignore length byte!) */
146 rom = isa_bus_to_virt(extension_rom_resource.start);
147 if (romsignature(rom)) {
148 length = extension_rom_resource.end - extension_rom_resource.start + 1;
149 if (romchecksum(rom, length)) {
150 request_resource(&iomem_resource, &extension_rom_resource);
151 upper = extension_rom_resource.start;
152 }
153 }
154
155 /* check for adapter roms on 2k boundaries */
156 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
157 rom = isa_bus_to_virt(start);
158 if (!romsignature(rom))
159 continue;
160
161 if (probe_kernel_address(rom + 2, c) != 0)
162 continue;
163
164 /* 0 < length <= 0x7f * 512, historically */
165 length = c * 512;
166
167 /* but accept any length that fits if checksum okay */
168 if (!length || start + length > upper || !romchecksum(rom, length))
169 continue;
170
171 adapter_rom_resources[i].start = start;
172 adapter_rom_resources[i].end = start + length - 1;
173 request_resource(&iomem_resource, &adapter_rom_resources[i]);
174
175 start = adapter_rom_resources[i++].end & ~2047UL;
176 }
177}
178
179/*
180 * Request address space for all standard RAM and ROM resources
181 * and also for regions reported as reserved by the e820.
182 */
183void __init init_iomem_resources(struct resource *code_resource,
184 struct resource *data_resource,
185 struct resource *bss_resource)
186{
187 int i;
188
189 probe_roms();
190 for (i = 0; i < e820.nr_map; i++) {
191 struct resource *res;
192#ifndef CONFIG_RESOURCES_64BIT
193 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
194 continue;
195#endif
196 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
197 switch (e820.map[i].type) {
198 case E820_RAM: res->name = "System RAM"; break;
199 case E820_ACPI: res->name = "ACPI Tables"; break;
200 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
201 default: res->name = "reserved";
202 }
203 res->start = e820.map[i].addr;
204 res->end = res->start + e820.map[i].size - 1;
205 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
206 if (request_resource(&iomem_resource, res)) {
207 kfree(res);
208 continue;
209 }
210 if (e820.map[i].type == E820_RAM) {
211 /*
212 * We don't know which RAM region contains kernel data,
213 * so we try it repeatedly and let the resource manager
214 * test it.
215 */
216 request_resource(res, code_resource);
217 request_resource(res, data_resource);
218 request_resource(res, bss_resource);
219#ifdef CONFIG_KEXEC
220 if (crashk_res.start != crashk_res.end)
221 request_resource(res, &crashk_res);
222#endif
223 }
224 }
225}
226
227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
228/**
229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
230 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
231 * hibernation.
232 *
233 * This function requires the e820 map to be sorted and without any
234 * overlapping entries and assumes the first e820 area to be RAM.
235 */
236void __init e820_mark_nosave_regions(void)
237{
238 int i;
239 unsigned long pfn;
240
241 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
242 for (i = 1; i < e820.nr_map; i++) {
243 struct e820entry *ei = &e820.map[i];
244
245 if (pfn < PFN_UP(ei->addr))
246 register_nosave_region(pfn, PFN_UP(ei->addr));
247
248 pfn = PFN_DOWN(ei->addr + ei->size);
249 if (ei->type != E820_RAM)
250 register_nosave_region(PFN_UP(ei->addr), pfn);
251
252 if (pfn >= max_low_pfn)
253 break;
254 }
255}
256#endif
257
258void __init add_memory_region(unsigned long long start,
259 unsigned long long size, int type)
260{
261 int x;
262
263 x = e820.nr_map;
264
265 if (x == E820MAX) {
266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
267 return;
268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
274} /* add_memory_region */
275
276/*
277 * Sanitize the BIOS e820 map.
278 *
279 * Some e820 responses include overlapping entries. The following
280 * replaces the original e820 map with a new one, removing overlaps.
281 *
282 */
283int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
284{
285 struct change_member *change_tmp;
286 unsigned long current_type, last_type;
287 unsigned long long last_addr;
288 int chgidx, still_changing;
289 int overlap_entries;
290 int new_bios_entry;
291 int old_nr, new_nr, chg_nr;
292 int i;
293
294 /*
295 Visually we're performing the following (1,2,3,4 = memory types)...
296
297 Sample memory map (w/overlaps):
298 ____22__________________
299 ______________________4_
300 ____1111________________
301 _44_____________________
302 11111111________________
303 ____________________33__
304 ___________44___________
305 __________33333_________
306 ______________22________
307 ___________________2222_
308 _________111111111______
309 _____________________11_
310 _________________4______
311
312 Sanitized equivalent (no overlap):
313 1_______________________
314 _44_____________________
315 ___1____________________
316 ____22__________________
317 ______11________________
318 _________1______________
319 __________3_____________
320 ___________44___________
321 _____________33_________
322 _______________2________
323 ________________1_______
324 _________________4______
325 ___________________2____
326 ____________________33__
327 ______________________4_
328 */
329 /* if there's only one memory region, don't bother */
330 if (*pnr_map < 2) {
331 return -1;
332 }
333
334 old_nr = *pnr_map;
335
336 /* bail out if we find any unreasonable addresses in bios map */
337 for (i=0; i<old_nr; i++)
338 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
339 return -1;
340 }
341
342 /* create pointers for initial change-point information (for sorting) */
343 for (i=0; i < 2*old_nr; i++)
344 change_point[i] = &change_point_list[i];
345
346 /* record all known change-points (starting and ending addresses),
347 omitting those that are for empty memory regions */
348 chgidx = 0;
349 for (i=0; i < old_nr; i++) {
350 if (biosmap[i].size != 0) {
351 change_point[chgidx]->addr = biosmap[i].addr;
352 change_point[chgidx++]->pbios = &biosmap[i];
353 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
354 change_point[chgidx++]->pbios = &biosmap[i];
355 }
356 }
357 chg_nr = chgidx; /* true number of change-points */
358
359 /* sort change-point list by memory addresses (low -> high) */
360 still_changing = 1;
361 while (still_changing) {
362 still_changing = 0;
363 for (i=1; i < chg_nr; i++) {
364 /* if <current_addr> > <last_addr>, swap */
365 /* or, if current=<start_addr> & last=<end_addr>, swap */
366 if ((change_point[i]->addr < change_point[i-1]->addr) ||
367 ((change_point[i]->addr == change_point[i-1]->addr) &&
368 (change_point[i]->addr == change_point[i]->pbios->addr) &&
369 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
370 )
371 {
372 change_tmp = change_point[i];
373 change_point[i] = change_point[i-1];
374 change_point[i-1] = change_tmp;
375 still_changing=1;
376 }
377 }
378 }
379
380 /* create a new bios memory map, removing overlaps */
381 overlap_entries=0; /* number of entries in the overlap table */
382 new_bios_entry=0; /* index for creating new bios map entries */
383 last_type = 0; /* start with undefined memory type */
384 last_addr = 0; /* start with 0 as last starting address */
385 /* loop through change-points, determining affect on the new bios map */
386 for (chgidx=0; chgidx < chg_nr; chgidx++)
387 {
388 /* keep track of all overlapping bios entries */
389 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
390 {
391 /* add map entry to overlap list (> 1 entry implies an overlap) */
392 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
393 }
394 else
395 {
396 /* remove entry from list (order independent, so swap with last) */
397 for (i=0; i<overlap_entries; i++)
398 {
399 if (overlap_list[i] == change_point[chgidx]->pbios)
400 overlap_list[i] = overlap_list[overlap_entries-1];
401 }
402 overlap_entries--;
403 }
404 /* if there are overlapping entries, decide which "type" to use */
405 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
406 current_type = 0;
407 for (i=0; i<overlap_entries; i++)
408 if (overlap_list[i]->type > current_type)
409 current_type = overlap_list[i]->type;
410 /* continue building up new bios map based on this information */
411 if (current_type != last_type) {
412 if (last_type != 0) {
413 new_bios[new_bios_entry].size =
414 change_point[chgidx]->addr - last_addr;
415 /* move forward only if the new size was non-zero */
416 if (new_bios[new_bios_entry].size != 0)
417 if (++new_bios_entry >= E820MAX)
418 break; /* no more space left for new bios entries */
419 }
420 if (current_type != 0) {
421 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
422 new_bios[new_bios_entry].type = current_type;
423 last_addr=change_point[chgidx]->addr;
424 }
425 last_type = current_type;
426 }
427 }
428 new_nr = new_bios_entry; /* retain count for new bios entries */
429
430 /* copy new bios mapping into original location */
431 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
432 *pnr_map = new_nr;
433
434 return 0;
435}
436
437/*
438 * Copy the BIOS e820 map into a safe place.
439 *
440 * Sanity-check it while we're at it..
441 *
442 * If we're lucky and live on a modern system, the setup code
443 * will have given us a memory map that we can use to properly
444 * set up memory. If we aren't, we'll fake a memory map.
445 *
446 * We check to see that the memory map contains at least 2 elements
447 * before we'll use it, because the detection code in setup.S may
448 * not be perfect and most every PC known to man has two memory
449 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
450 * thinkpad 560x, for example, does not cooperate with the memory
451 * detection code.)
452 */
453int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
454{
455 /* Only one memory region (or negative)? Ignore it */
456 if (nr_map < 2)
457 return -1;
458
459 do {
460 u64 start = biosmap->addr;
461 u64 size = biosmap->size;
462 u64 end = start + size;
463 u32 type = biosmap->type;
464
465 /* Overflow in 64 bits? Ignore the memory map. */
466 if (start > end)
467 return -1;
468
469 add_memory_region(start, size, type);
470 } while (biosmap++, --nr_map);
471
472 return 0;
473}
474
475/*
476 * Find the highest page frame number we have available
477 */
478void __init propagate_e820_map(void)
479{
480 int i;
481
482 max_pfn = 0;
483
484 for (i = 0; i < e820.nr_map; i++) {
485 unsigned long start, end;
486 /* RAM? */
487 if (e820.map[i].type != E820_RAM)
488 continue;
489 start = PFN_UP(e820.map[i].addr);
490 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
491 if (start >= end)
492 continue;
493 if (end > max_pfn)
494 max_pfn = end;
495 memory_present(0, start, end);
496 }
497}
498
499/*
500 * Register fully available low RAM pages with the bootmem allocator.
501 */
502void __init register_bootmem_low_pages(unsigned long max_low_pfn)
503{
504 int i;
505
506 for (i = 0; i < e820.nr_map; i++) {
507 unsigned long curr_pfn, last_pfn, size;
508 /*
509 * Reserve usable low memory
510 */
511 if (e820.map[i].type != E820_RAM)
512 continue;
513 /*
514 * We are rounding up the start address of usable memory:
515 */
516 curr_pfn = PFN_UP(e820.map[i].addr);
517 if (curr_pfn >= max_low_pfn)
518 continue;
519 /*
520 * ... and at the end of the usable range downwards:
521 */
522 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
523
524 if (last_pfn > max_low_pfn)
525 last_pfn = max_low_pfn;
526
527 /*
528 * .. finally, did all the rounding and playing
529 * around just make the area go away?
530 */
531 if (last_pfn <= curr_pfn)
532 continue;
533
534 size = last_pfn - curr_pfn;
535 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
536 }
537}
538
539void __init e820_register_memory(void)
540{
541 unsigned long gapstart, gapsize, round;
542 unsigned long long last;
543 int i;
544
545 /*
546 * Search for the biggest gap in the low 32 bits of the e820
547 * memory space.
548 */
549 last = 0x100000000ull;
550 gapstart = 0x10000000;
551 gapsize = 0x400000;
552 i = e820.nr_map;
553 while (--i >= 0) {
554 unsigned long long start = e820.map[i].addr;
555 unsigned long long end = start + e820.map[i].size;
556
557 /*
558 * Since "last" is at most 4GB, we know we'll
559 * fit in 32 bits if this condition is true
560 */
561 if (last > end) {
562 unsigned long gap = last - end;
563
564 if (gap > gapsize) {
565 gapsize = gap;
566 gapstart = end;
567 }
568 }
569 if (start < last)
570 last = start;
571 }
572
573 /*
574 * See how much we want to round up: start off with
575 * rounding to the next 1MB area.
576 */
577 round = 0x100000;
578 while ((gapsize >> 4) > round)
579 round += round;
580 /* Fun with two's complement */
581 pci_mem_start = (gapstart + round) & -round;
582
583 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
584 pci_mem_start, gapstart, gapsize);
585}
586
587void __init print_memory_map(char *who)
588{
589 int i;
590
591 for (i = 0; i < e820.nr_map; i++) {
592 printk(" %s: %016Lx - %016Lx ", who,
593 e820.map[i].addr,
594 e820.map[i].addr + e820.map[i].size);
595 switch (e820.map[i].type) {
596 case E820_RAM: printk("(usable)\n");
597 break;
598 case E820_RESERVED:
599 printk("(reserved)\n");
600 break;
601 case E820_ACPI:
602 printk("(ACPI data)\n");
603 break;
604 case E820_NVS:
605 printk("(ACPI NVS)\n");
606 break;
607 default: printk("type %u\n", e820.map[i].type);
608 break;
609 }
610 }
611}
612
613void __init limit_regions(unsigned long long size)
614{
615 unsigned long long current_addr;
616 int i;
617
618 print_memory_map("limit_regions start");
619 for (i = 0; i < e820.nr_map; i++) {
620 current_addr = e820.map[i].addr + e820.map[i].size;
621 if (current_addr < size)
622 continue;
623
624 if (e820.map[i].type != E820_RAM)
625 continue;
626
627 if (e820.map[i].addr >= size) {
628 /*
629 * This region starts past the end of the
630 * requested size, skip it completely.
631 */
632 e820.nr_map = i;
633 } else {
634 e820.nr_map = i + 1;
635 e820.map[i].size -= current_addr - size;
636 }
637 print_memory_map("limit_regions endfor");
638 return;
639 }
640 print_memory_map("limit_regions endfunc");
641}
642
643/*
644 * This function checks if any part of the range <start,end> is mapped
645 * with type.
646 */
647int
648e820_any_mapped(u64 start, u64 end, unsigned type)
649{
650 int i;
651 for (i = 0; i < e820.nr_map; i++) {
652 const struct e820entry *ei = &e820.map[i];
653 if (type && ei->type != type)
654 continue;
655 if (ei->addr >= end || ei->addr + ei->size <= start)
656 continue;
657 return 1;
658 }
659 return 0;
660}
661EXPORT_SYMBOL_GPL(e820_any_mapped);
662
663 /*
664 * This function checks if the entire range <start,end> is mapped with type.
665 *
666 * Note: this function only works correct if the e820 table is sorted and
667 * not-overlapping, which is the case
668 */
669int __init
670e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
671{
672 u64 start = s;
673 u64 end = e;
674 int i;
675 for (i = 0; i < e820.nr_map; i++) {
676 struct e820entry *ei = &e820.map[i];
677 if (type && ei->type != type)
678 continue;
679 /* is the region (part) in overlap with the current region ?*/
680 if (ei->addr >= end || ei->addr + ei->size <= start)
681 continue;
682 /* if the region is at the beginning of <start,end> we move
683 * start to the end of the region since it's ok until there
684 */
685 if (ei->addr <= start)
686 start = ei->addr + ei->size;
687 /* if start is now at or beyond end, we're done, full
688 * coverage */
689 if (start >= end)
690 return 1; /* we're done */
691 }
692 return 0;
693}
694
695static int __init parse_memmap(char *arg)
696{
697 if (!arg)
698 return -EINVAL;
699
700 if (strcmp(arg, "exactmap") == 0) {
701#ifdef CONFIG_CRASH_DUMP
702 /* If we are doing a crash dump, we
703 * still need to know the real mem
704 * size before original memory map is
705 * reset.
706 */
707 propagate_e820_map();
708 saved_max_pfn = max_pfn;
709#endif
710 e820.nr_map = 0;
711 user_defined_memmap = 1;
712 } else {
713 /* If the user specifies memory size, we
714 * limit the BIOS-provided memory map to
715 * that size. exactmap can be used to specify
716 * the exact map. mem=number can be used to
717 * trim the existing memory map.
718 */
719 unsigned long long start_at, mem_size;
720
721 mem_size = memparse(arg, &arg);
722 if (*arg == '@') {
723 start_at = memparse(arg+1, &arg);
724 add_memory_region(start_at, mem_size, E820_RAM);
725 } else if (*arg == '#') {
726 start_at = memparse(arg+1, &arg);
727 add_memory_region(start_at, mem_size, E820_ACPI);
728 } else if (*arg == '$') {
729 start_at = memparse(arg+1, &arg);
730 add_memory_region(start_at, mem_size, E820_RESERVED);
731 } else {
732 limit_regions(mem_size);
733 user_defined_memmap = 1;
734 }
735 }
736 return 0;
737}
738early_param("memmap", parse_memmap);
739void __init update_memory_range(u64 start, u64 size, unsigned old_type,
740 unsigned new_type)
741{
742 int i;
743
744 BUG_ON(old_type == new_type);
745
746 for (i = 0; i < e820.nr_map; i++) {
747 struct e820entry *ei = &e820.map[i];
748 u64 final_start, final_end;
749 if (ei->type != old_type)
750 continue;
751 /* totally covered? */
752 if (ei->addr >= start && ei->size <= size) {
753 ei->type = new_type;
754 continue;
755 }
756 /* partially covered */
757 final_start = max(start, ei->addr);
758 final_end = min(start + size, ei->addr + ei->size);
759 if (final_start >= final_end)
760 continue;
761 add_memory_region(final_start, final_end - final_start,
762 new_type);
763 }
764}
765void __init update_e820(void)
766{
767 u8 nr_map;
768
769 nr_map = e820.nr_map;
770 if (sanitize_e820_map(e820.map, &nr_map))
771 return;
772 e820.nr_map = nr_map;
773 printk(KERN_INFO "modified physical RAM map:\n");
774 print_memory_map("modified");
775}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
deleted file mode 100644
index 124480c0008d..000000000000
--- a/arch/x86/kernel/e820_64.c
+++ /dev/null
@@ -1,952 +0,0 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/setup.h>
28#include <asm/sections.h>
29#include <asm/kdebug.h>
30#include <asm/trampoline.h>
31
32struct e820map e820;
33
34/*
35 * PFN of last memory page.
36 */
37unsigned long end_pfn;
38
39/*
40 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
41 * The direct mapping extends to max_pfn_mapped, so that we can directly access
42 * apertures, ACPI and other tables without having to play with fixmaps.
43 */
44unsigned long max_pfn_mapped;
45
46/*
47 * Last pfn which the user wants to use.
48 */
49static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
50
51/*
52 * Early reserved memory areas.
53 */
54#define MAX_EARLY_RES 20
55
56struct early_res {
57 unsigned long start, end;
58 char name[16];
59};
60static struct early_res early_res[MAX_EARLY_RES] __initdata = {
61 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
62#ifdef CONFIG_X86_TRAMPOLINE
63 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
64#endif
65 {}
66};
67
68void __init reserve_early(unsigned long start, unsigned long end, char *name)
69{
70 int i;
71 struct early_res *r;
72 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
73 r = &early_res[i];
74 if (end > r->start && start < r->end)
75 panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
76 start, end - 1, name?name:"", r->start, r->end - 1, r->name);
77 }
78 if (i >= MAX_EARLY_RES)
79 panic("Too many early reservations");
80 r = &early_res[i];
81 r->start = start;
82 r->end = end;
83 if (name)
84 strncpy(r->name, name, sizeof(r->name) - 1);
85}
86
87void __init free_early(unsigned long start, unsigned long end)
88{
89 struct early_res *r;
90 int i, j;
91
92 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
93 r = &early_res[i];
94 if (start == r->start && end == r->end)
95 break;
96 }
97 if (i >= MAX_EARLY_RES || !early_res[i].end)
98 panic("free_early on not reserved area: %lx-%lx!", start, end);
99
100 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
101 ;
102
103 memmove(&early_res[i], &early_res[i + 1],
104 (j - 1 - i) * sizeof(struct early_res));
105
106 early_res[j - 1].end = 0;
107}
108
109void __init early_res_to_bootmem(unsigned long start, unsigned long end)
110{
111 int i;
112 unsigned long final_start, final_end;
113 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
114 struct early_res *r = &early_res[i];
115 final_start = max(start, r->start);
116 final_end = min(end, r->end);
117 if (final_start >= final_end)
118 continue;
119 printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
120 final_start, final_end - 1, r->name);
121 reserve_bootmem_generic(final_start, final_end - final_start);
122 }
123}
124
125/* Check for already reserved areas */
126static inline int __init
127bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
128{
129 int i;
130 unsigned long addr = *addrp, last;
131 int changed = 0;
132again:
133 last = addr + size;
134 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
135 struct early_res *r = &early_res[i];
136 if (last >= r->start && addr < r->end) {
137 *addrp = addr = round_up(r->end, align);
138 changed = 1;
139 goto again;
140 }
141 }
142 return changed;
143}
144
145/* Check for already reserved areas */
146static inline int __init
147bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
148{
149 int i;
150 unsigned long addr = *addrp, last;
151 unsigned long size = *sizep;
152 int changed = 0;
153again:
154 last = addr + size;
155 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
156 struct early_res *r = &early_res[i];
157 if (last > r->start && addr < r->start) {
158 size = r->start - addr;
159 changed = 1;
160 goto again;
161 }
162 if (last > r->end && addr < r->end) {
163 addr = round_up(r->end, align);
164 size = last - addr;
165 changed = 1;
166 goto again;
167 }
168 if (last <= r->end && addr >= r->start) {
169 (*sizep)++;
170 return 0;
171 }
172 }
173 if (changed) {
174 *addrp = addr;
175 *sizep = size;
176 }
177 return changed;
178}
179/*
180 * This function checks if any part of the range <start,end> is mapped
181 * with type.
182 */
183int
184e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
185{
186 int i;
187
188 for (i = 0; i < e820.nr_map; i++) {
189 struct e820entry *ei = &e820.map[i];
190
191 if (type && ei->type != type)
192 continue;
193 if (ei->addr >= end || ei->addr + ei->size <= start)
194 continue;
195 return 1;
196 }
197 return 0;
198}
199EXPORT_SYMBOL_GPL(e820_any_mapped);
200
201/*
202 * This function checks if the entire range <start,end> is mapped with type.
203 *
204 * Note: this function only works correct if the e820 table is sorted and
205 * not-overlapping, which is the case
206 */
207int __init e820_all_mapped(unsigned long start, unsigned long end,
208 unsigned type)
209{
210 int i;
211
212 for (i = 0; i < e820.nr_map; i++) {
213 struct e820entry *ei = &e820.map[i];
214
215 if (type && ei->type != type)
216 continue;
217 /* is the region (part) in overlap with the current region ?*/
218 if (ei->addr >= end || ei->addr + ei->size <= start)
219 continue;
220
221 /* if the region is at the beginning of <start,end> we move
222 * start to the end of the region since it's ok until there
223 */
224 if (ei->addr <= start)
225 start = ei->addr + ei->size;
226 /*
227 * if start is now at or beyond end, we're done, full
228 * coverage
229 */
230 if (start >= end)
231 return 1;
232 }
233 return 0;
234}
235
236/*
237 * Find a free area with specified alignment in a specific range.
238 */
239unsigned long __init find_e820_area(unsigned long start, unsigned long end,
240 unsigned long size, unsigned long align)
241{
242 int i;
243
244 for (i = 0; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246 unsigned long addr, last;
247 unsigned long ei_last;
248
249 if (ei->type != E820_RAM)
250 continue;
251 addr = round_up(ei->addr, align);
252 ei_last = ei->addr + ei->size;
253 if (addr < start)
254 addr = round_up(start, align);
255 if (addr >= ei_last)
256 continue;
257 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
258 ;
259 last = addr + size;
260 if (last > ei_last)
261 continue;
262 if (last > end)
263 continue;
264 return addr;
265 }
266 return -1UL;
267}
268
269/*
270 * Find next free range after *start
271 */
272unsigned long __init find_e820_area_size(unsigned long start,
273 unsigned long *sizep,
274 unsigned long align)
275{
276 int i;
277
278 for (i = 0; i < e820.nr_map; i++) {
279 struct e820entry *ei = &e820.map[i];
280 unsigned long addr, last;
281 unsigned long ei_last;
282
283 if (ei->type != E820_RAM)
284 continue;
285 addr = round_up(ei->addr, align);
286 ei_last = ei->addr + ei->size;
287 if (addr < start)
288 addr = round_up(start, align);
289 if (addr >= ei_last)
290 continue;
291 *sizep = ei_last - addr;
292 while (bad_addr_size(&addr, sizep, align) &&
293 addr + *sizep <= ei_last)
294 ;
295 last = addr + *sizep;
296 if (last > ei_last)
297 continue;
298 return addr;
299 }
300 return -1UL;
301
302}
303/*
304 * Find the highest page frame number we have available
305 */
306unsigned long __init e820_end_of_ram(void)
307{
308 unsigned long end_pfn;
309
310 end_pfn = find_max_pfn_with_active_regions();
311
312 if (end_pfn > max_pfn_mapped)
313 max_pfn_mapped = end_pfn;
314 if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
315 max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
316 if (end_pfn > end_user_pfn)
317 end_pfn = end_user_pfn;
318 if (end_pfn > max_pfn_mapped)
319 end_pfn = max_pfn_mapped;
320
321 printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
322 return end_pfn;
323}
324
325/*
326 * Mark e820 reserved areas as busy for the resource manager.
327 */
328void __init e820_reserve_resources(void)
329{
330 int i;
331 struct resource *res;
332
333 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
334 for (i = 0; i < e820.nr_map; i++) {
335 switch (e820.map[i].type) {
336 case E820_RAM: res->name = "System RAM"; break;
337 case E820_ACPI: res->name = "ACPI Tables"; break;
338 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
339 default: res->name = "reserved";
340 }
341 res->start = e820.map[i].addr;
342 res->end = res->start + e820.map[i].size - 1;
343 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
344 insert_resource(&iomem_resource, res);
345 res++;
346 }
347}
348
349/*
350 * Find the ranges of physical addresses that do not correspond to
351 * e820 RAM areas and mark the corresponding pages as nosave for software
352 * suspend and suspend to RAM.
353 *
354 * This function requires the e820 map to be sorted and without any
355 * overlapping entries and assumes the first e820 area to be RAM.
356 */
357void __init e820_mark_nosave_regions(void)
358{
359 int i;
360 unsigned long paddr;
361
362 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
363 for (i = 1; i < e820.nr_map; i++) {
364 struct e820entry *ei = &e820.map[i];
365
366 if (paddr < ei->addr)
367 register_nosave_region(PFN_DOWN(paddr),
368 PFN_UP(ei->addr));
369
370 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
371 if (ei->type != E820_RAM)
372 register_nosave_region(PFN_UP(ei->addr),
373 PFN_DOWN(paddr));
374
375 if (paddr >= (end_pfn << PAGE_SHIFT))
376 break;
377 }
378}
379
380/*
381 * Finds an active region in the address range from start_pfn to end_pfn and
382 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
383 */
384static int __init e820_find_active_region(const struct e820entry *ei,
385 unsigned long start_pfn,
386 unsigned long end_pfn,
387 unsigned long *ei_startpfn,
388 unsigned long *ei_endpfn)
389{
390 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
391 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
392
393 /* Skip map entries smaller than a page */
394 if (*ei_startpfn >= *ei_endpfn)
395 return 0;
396
397 /* Check if max_pfn_mapped should be updated */
398 if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
399 max_pfn_mapped = *ei_endpfn;
400
401 /* Skip if map is outside the node */
402 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
403 *ei_startpfn >= end_pfn)
404 return 0;
405
406 /* Check for overlaps */
407 if (*ei_startpfn < start_pfn)
408 *ei_startpfn = start_pfn;
409 if (*ei_endpfn > end_pfn)
410 *ei_endpfn = end_pfn;
411
412 /* Obey end_user_pfn to save on memmap */
413 if (*ei_startpfn >= end_user_pfn)
414 return 0;
415 if (*ei_endpfn > end_user_pfn)
416 *ei_endpfn = end_user_pfn;
417
418 return 1;
419}
420
421/* Walk the e820 map and register active regions within a node */
422void __init
423e820_register_active_regions(int nid, unsigned long start_pfn,
424 unsigned long end_pfn)
425{
426 unsigned long ei_startpfn;
427 unsigned long ei_endpfn;
428 int i;
429
430 for (i = 0; i < e820.nr_map; i++)
431 if (e820_find_active_region(&e820.map[i],
432 start_pfn, end_pfn,
433 &ei_startpfn, &ei_endpfn))
434 add_active_range(nid, ei_startpfn, ei_endpfn);
435}
436
437/*
438 * Add a memory region to the kernel e820 map.
439 */
440void __init add_memory_region(unsigned long start, unsigned long size, int type)
441{
442 int x = e820.nr_map;
443
444 if (x == E820MAX) {
445 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
446 return;
447 }
448
449 e820.map[x].addr = start;
450 e820.map[x].size = size;
451 e820.map[x].type = type;
452 e820.nr_map++;
453}
454
455/*
456 * Find the hole size (in bytes) in the memory range.
457 * @start: starting address of the memory range to scan
458 * @end: ending address of the memory range to scan
459 */
460unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
461{
462 unsigned long start_pfn = start >> PAGE_SHIFT;
463 unsigned long end_pfn = end >> PAGE_SHIFT;
464 unsigned long ei_startpfn, ei_endpfn, ram = 0;
465 int i;
466
467 for (i = 0; i < e820.nr_map; i++) {
468 if (e820_find_active_region(&e820.map[i],
469 start_pfn, end_pfn,
470 &ei_startpfn, &ei_endpfn))
471 ram += ei_endpfn - ei_startpfn;
472 }
473 return end - start - (ram << PAGE_SHIFT);
474}
475
476static void __init e820_print_map(char *who)
477{
478 int i;
479
480 for (i = 0; i < e820.nr_map; i++) {
481 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
482 (unsigned long long) e820.map[i].addr,
483 (unsigned long long)
484 (e820.map[i].addr + e820.map[i].size));
485 switch (e820.map[i].type) {
486 case E820_RAM:
487 printk(KERN_CONT "(usable)\n");
488 break;
489 case E820_RESERVED:
490 printk(KERN_CONT "(reserved)\n");
491 break;
492 case E820_ACPI:
493 printk(KERN_CONT "(ACPI data)\n");
494 break;
495 case E820_NVS:
496 printk(KERN_CONT "(ACPI NVS)\n");
497 break;
498 default:
499 printk(KERN_CONT "type %u\n", e820.map[i].type);
500 break;
501 }
502 }
503}
504
505/*
506 * Sanitize the BIOS e820 map.
507 *
508 * Some e820 responses include overlapping entries. The following
509 * replaces the original e820 map with a new one, removing overlaps.
510 *
511 */
512static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
513{
514 struct change_member {
515 struct e820entry *pbios; /* pointer to original bios entry */
516 unsigned long long addr; /* address for this change point */
517 };
518 static struct change_member change_point_list[2*E820MAX] __initdata;
519 static struct change_member *change_point[2*E820MAX] __initdata;
520 static struct e820entry *overlap_list[E820MAX] __initdata;
521 static struct e820entry new_bios[E820MAX] __initdata;
522 struct change_member *change_tmp;
523 unsigned long current_type, last_type;
524 unsigned long long last_addr;
525 int chgidx, still_changing;
526 int overlap_entries;
527 int new_bios_entry;
528 int old_nr, new_nr, chg_nr;
529 int i;
530
531 /*
532 Visually we're performing the following
533 (1,2,3,4 = memory types)...
534
535 Sample memory map (w/overlaps):
536 ____22__________________
537 ______________________4_
538 ____1111________________
539 _44_____________________
540 11111111________________
541 ____________________33__
542 ___________44___________
543 __________33333_________
544 ______________22________
545 ___________________2222_
546 _________111111111______
547 _____________________11_
548 _________________4______
549
550 Sanitized equivalent (no overlap):
551 1_______________________
552 _44_____________________
553 ___1____________________
554 ____22__________________
555 ______11________________
556 _________1______________
557 __________3_____________
558 ___________44___________
559 _____________33_________
560 _______________2________
561 ________________1_______
562 _________________4______
563 ___________________2____
564 ____________________33__
565 ______________________4_
566 */
567
568 /* if there's only one memory region, don't bother */
569 if (*pnr_map < 2)
570 return -1;
571
572 old_nr = *pnr_map;
573
574 /* bail out if we find any unreasonable addresses in bios map */
575 for (i = 0; i < old_nr; i++)
576 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
577 return -1;
578
579 /* create pointers for initial change-point information (for sorting) */
580 for (i = 0; i < 2 * old_nr; i++)
581 change_point[i] = &change_point_list[i];
582
583 /* record all known change-points (starting and ending addresses),
584 omitting those that are for empty memory regions */
585 chgidx = 0;
586 for (i = 0; i < old_nr; i++) {
587 if (biosmap[i].size != 0) {
588 change_point[chgidx]->addr = biosmap[i].addr;
589 change_point[chgidx++]->pbios = &biosmap[i];
590 change_point[chgidx]->addr = biosmap[i].addr +
591 biosmap[i].size;
592 change_point[chgidx++]->pbios = &biosmap[i];
593 }
594 }
595 chg_nr = chgidx;
596
597 /* sort change-point list by memory addresses (low -> high) */
598 still_changing = 1;
599 while (still_changing) {
600 still_changing = 0;
601 for (i = 1; i < chg_nr; i++) {
602 unsigned long long curaddr, lastaddr;
603 unsigned long long curpbaddr, lastpbaddr;
604
605 curaddr = change_point[i]->addr;
606 lastaddr = change_point[i - 1]->addr;
607 curpbaddr = change_point[i]->pbios->addr;
608 lastpbaddr = change_point[i - 1]->pbios->addr;
609
610 /*
611 * swap entries, when:
612 *
613 * curaddr > lastaddr or
614 * curaddr == lastaddr and curaddr == curpbaddr and
615 * lastaddr != lastpbaddr
616 */
617 if (curaddr < lastaddr ||
618 (curaddr == lastaddr && curaddr == curpbaddr &&
619 lastaddr != lastpbaddr)) {
620 change_tmp = change_point[i];
621 change_point[i] = change_point[i-1];
622 change_point[i-1] = change_tmp;
623 still_changing = 1;
624 }
625 }
626 }
627
628 /* create a new bios memory map, removing overlaps */
629 overlap_entries = 0; /* number of entries in the overlap table */
630 new_bios_entry = 0; /* index for creating new bios map entries */
631 last_type = 0; /* start with undefined memory type */
632 last_addr = 0; /* start with 0 as last starting address */
633
634 /* loop through change-points, determining affect on the new bios map */
635 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
636 /* keep track of all overlapping bios entries */
637 if (change_point[chgidx]->addr ==
638 change_point[chgidx]->pbios->addr) {
639 /*
640 * add map entry to overlap list (> 1 entry
641 * implies an overlap)
642 */
643 overlap_list[overlap_entries++] =
644 change_point[chgidx]->pbios;
645 } else {
646 /*
647 * remove entry from list (order independent,
648 * so swap with last)
649 */
650 for (i = 0; i < overlap_entries; i++) {
651 if (overlap_list[i] ==
652 change_point[chgidx]->pbios)
653 overlap_list[i] =
654 overlap_list[overlap_entries-1];
655 }
656 overlap_entries--;
657 }
658 /*
659 * if there are overlapping entries, decide which
660 * "type" to use (larger value takes precedence --
661 * 1=usable, 2,3,4,4+=unusable)
662 */
663 current_type = 0;
664 for (i = 0; i < overlap_entries; i++)
665 if (overlap_list[i]->type > current_type)
666 current_type = overlap_list[i]->type;
667 /*
668 * continue building up new bios map based on this
669 * information
670 */
671 if (current_type != last_type) {
672 if (last_type != 0) {
673 new_bios[new_bios_entry].size =
674 change_point[chgidx]->addr - last_addr;
675 /*
676 * move forward only if the new size
677 * was non-zero
678 */
679 if (new_bios[new_bios_entry].size != 0)
680 /*
681 * no more space left for new
682 * bios entries ?
683 */
684 if (++new_bios_entry >= E820MAX)
685 break;
686 }
687 if (current_type != 0) {
688 new_bios[new_bios_entry].addr =
689 change_point[chgidx]->addr;
690 new_bios[new_bios_entry].type = current_type;
691 last_addr = change_point[chgidx]->addr;
692 }
693 last_type = current_type;
694 }
695 }
696 /* retain count for new bios entries */
697 new_nr = new_bios_entry;
698
699 /* copy new bios mapping into original location */
700 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
701 *pnr_map = new_nr;
702
703 return 0;
704}
705
706/*
707 * Copy the BIOS e820 map into a safe place.
708 *
709 * Sanity-check it while we're at it..
710 *
711 * If we're lucky and live on a modern system, the setup code
712 * will have given us a memory map that we can use to properly
713 * set up memory. If we aren't, we'll fake a memory map.
714 */
715static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
716{
717 /* Only one memory region (or negative)? Ignore it */
718 if (nr_map < 2)
719 return -1;
720
721 do {
722 u64 start = biosmap->addr;
723 u64 size = biosmap->size;
724 u64 end = start + size;
725 u32 type = biosmap->type;
726
727 /* Overflow in 64 bits? Ignore the memory map. */
728 if (start > end)
729 return -1;
730
731 add_memory_region(start, size, type);
732 } while (biosmap++, --nr_map);
733 return 0;
734}
735
736static void early_panic(char *msg)
737{
738 early_printk(msg);
739 panic(msg);
740}
741
742/* We're not void only for x86 32-bit compat */
743char * __init machine_specific_memory_setup(void)
744{
745 char *who = "BIOS-e820";
746 /*
747 * Try to copy the BIOS-supplied E820-map.
748 *
749 * Otherwise fake a memory map; one section from 0k->640k,
750 * the next section from 1mb->appropriate_mem_k
751 */
752 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
753 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
754 early_panic("Cannot find a valid memory map");
755 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
756 e820_print_map(who);
757
758 /* In case someone cares... */
759 return who;
760}
761
762static int __init parse_memopt(char *p)
763{
764 if (!p)
765 return -EINVAL;
766 end_user_pfn = memparse(p, &p);
767 end_user_pfn >>= PAGE_SHIFT;
768 return 0;
769}
770early_param("mem", parse_memopt);
771
772static int userdef __initdata;
773
774static int __init parse_memmap_opt(char *p)
775{
776 char *oldp;
777 unsigned long long start_at, mem_size;
778
779 if (!strcmp(p, "exactmap")) {
780#ifdef CONFIG_CRASH_DUMP
781 /*
782 * If we are doing a crash dump, we still need to know
783 * the real mem size before original memory map is
784 * reset.
785 */
786 e820_register_active_regions(0, 0, -1UL);
787 saved_max_pfn = e820_end_of_ram();
788 remove_all_active_ranges();
789#endif
790 max_pfn_mapped = 0;
791 e820.nr_map = 0;
792 userdef = 1;
793 return 0;
794 }
795
796 oldp = p;
797 mem_size = memparse(p, &p);
798 if (p == oldp)
799 return -EINVAL;
800
801 userdef = 1;
802 if (*p == '@') {
803 start_at = memparse(p+1, &p);
804 add_memory_region(start_at, mem_size, E820_RAM);
805 } else if (*p == '#') {
806 start_at = memparse(p+1, &p);
807 add_memory_region(start_at, mem_size, E820_ACPI);
808 } else if (*p == '$') {
809 start_at = memparse(p+1, &p);
810 add_memory_region(start_at, mem_size, E820_RESERVED);
811 } else {
812 end_user_pfn = (mem_size >> PAGE_SHIFT);
813 }
814 return *p == '\0' ? 0 : -EINVAL;
815}
816early_param("memmap", parse_memmap_opt);
817
818void __init finish_e820_parsing(void)
819{
820 if (userdef) {
821 char nr = e820.nr_map;
822
823 if (sanitize_e820_map(e820.map, &nr) < 0)
824 early_panic("Invalid user supplied memory map");
825 e820.nr_map = nr;
826
827 printk(KERN_INFO "user-defined physical RAM map:\n");
828 e820_print_map("user");
829 }
830}
831
832void __init update_memory_range(u64 start, u64 size, unsigned old_type,
833 unsigned new_type)
834{
835 int i;
836
837 BUG_ON(old_type == new_type);
838
839 for (i = 0; i < e820.nr_map; i++) {
840 struct e820entry *ei = &e820.map[i];
841 u64 final_start, final_end;
842 if (ei->type != old_type)
843 continue;
844 /* totally covered? */
845 if (ei->addr >= start && ei->size <= size) {
846 ei->type = new_type;
847 continue;
848 }
849 /* partially covered */
850 final_start = max(start, ei->addr);
851 final_end = min(start + size, ei->addr + ei->size);
852 if (final_start >= final_end)
853 continue;
854 add_memory_region(final_start, final_end - final_start,
855 new_type);
856 }
857}
858
859void __init update_e820(void)
860{
861 u8 nr_map;
862
863 nr_map = e820.nr_map;
864 if (sanitize_e820_map(e820.map, &nr_map))
865 return;
866 e820.nr_map = nr_map;
867 printk(KERN_INFO "modified physical RAM map:\n");
868 e820_print_map("modified");
869}
870
871unsigned long pci_mem_start = 0xaeedbabe;
872EXPORT_SYMBOL(pci_mem_start);
873
874/*
875 * Search for the biggest gap in the low 32 bits of the e820
876 * memory space. We pass this space to PCI to assign MMIO resources
877 * for hotplug or unconfigured devices in.
878 * Hopefully the BIOS let enough space left.
879 */
880__init void e820_setup_gap(void)
881{
882 unsigned long gapstart, gapsize, round;
883 unsigned long last;
884 int i;
885 int found = 0;
886
887 last = 0x100000000ull;
888 gapstart = 0x10000000;
889 gapsize = 0x400000;
890 i = e820.nr_map;
891 while (--i >= 0) {
892 unsigned long long start = e820.map[i].addr;
893 unsigned long long end = start + e820.map[i].size;
894
895 /*
896 * Since "last" is at most 4GB, we know we'll
897 * fit in 32 bits if this condition is true
898 */
899 if (last > end) {
900 unsigned long gap = last - end;
901
902 if (gap > gapsize) {
903 gapsize = gap;
904 gapstart = end;
905 found = 1;
906 }
907 }
908 if (start < last)
909 last = start;
910 }
911
912 if (!found) {
913 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
914 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
915 "address range\n"
916 KERN_ERR "PCI: Unassigned devices with 32bit resource "
917 "registers may break!\n");
918 }
919
920 /*
921 * See how much we want to round up: start off with
922 * rounding to the next 1MB area.
923 */
924 round = 0x100000;
925 while ((gapsize >> 4) > round)
926 round += round;
927 /* Fun with two's complement */
928 pci_mem_start = (gapstart + round) & -round;
929
930 printk(KERN_INFO
931 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
932 pci_mem_start, gapstart, gapsize);
933}
934
935int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
936{
937 int i;
938
939 if (slot < 0 || slot >= e820.nr_map)
940 return -1;
941 for (i = slot; i < e820.nr_map; i++) {
942 if (e820.map[i].type != E820_RAM)
943 continue;
944 break;
945 }
946 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
947 return -1;
948 *addr = e820.map[i].addr;
949 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
950 max_pfn << PAGE_SHIFT) - *addr;
951 return i + 1;
952}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e82..4353cf5e6fac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
@@ -50,7 +47,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func)
50static void __init via_bugs(int num, int slot, int func) 47static void __init via_bugs(int num, int slot, int func)
51{ 48{
52#ifdef CONFIG_GART_IOMMU 49#ifdef CONFIG_GART_IOMMU
53 if ((end_pfn > MAX_DMA32_PFN || force_iommu) && 50 if ((max_pfn > MAX_DMA32_PFN || force_iommu) &&
54 !gart_iommu_aperture_allowed) { 51 !gart_iommu_aperture_allowed) {
55 printk(KERN_INFO 52 printk(KERN_INFO
56 "Looks like a VIA chipset. Disabling IOMMU." 53 "Looks like a VIA chipset. Disabling IOMMU."
@@ -98,17 +95,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
98 95
99} 96}
100 97
101static void __init ati_bugs(int num, int slot, int func)
102{
103#ifdef CONFIG_X86_IO_APIC
104 if (timer_over_8254 == 1) {
105 timer_over_8254 = 0;
106 printk(KERN_INFO
107 "ATI board detected. Disabling timer routing over 8254.\n");
108 }
109#endif
110}
111
112#define QFLAG_APPLY_ONCE 0x1 98#define QFLAG_APPLY_ONCE 0x1
113#define QFLAG_APPLIED 0x2 99#define QFLAG_APPLIED 0x2
114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,14 +112,23 @@ static struct chipset early_qrk[] __initdata = {
126 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, 112 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
127 { PCI_VENDOR_ID_VIA, PCI_ANY_ID, 113 { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
130 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
131 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
132 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
133 {} 117 {}
134}; 118};
135 119
136static void __init check_dev_quirk(int num, int slot, int func) 120/**
121 * check_dev_quirk - apply early quirks to a given PCI device
122 * @num: bus number
123 * @slot: slot number
124 * @func: PCI function
125 *
126 * Check the vendor & device ID against the early quirks table.
127 *
128 * If the device is single function, let early_quirks() know so we don't
129 * poke at this device again.
130 */
131static int __init check_dev_quirk(int num, int slot, int func)
137{ 132{
138 u16 class; 133 u16 class;
139 u16 vendor; 134 u16 vendor;
@@ -144,7 +139,7 @@ static void __init check_dev_quirk(int num, int slot, int func)
144 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); 139 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
145 140
146 if (class == 0xffff) 141 if (class == 0xffff)
147 return; 142 return -1; /* no class, treat as single function */
148 143
149 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); 144 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
150 145
@@ -167,7 +162,9 @@ static void __init check_dev_quirk(int num, int slot, int func)
167 type = read_pci_config_byte(num, slot, func, 162 type = read_pci_config_byte(num, slot, func,
168 PCI_HEADER_TYPE); 163 PCI_HEADER_TYPE);
169 if (!(type & 0x80)) 164 if (!(type & 0x80))
170 return; 165 return -1;
166
167 return 0;
171} 168}
172 169
173void __init early_quirks(void) 170void __init early_quirks(void)
@@ -180,6 +177,9 @@ void __init early_quirks(void)
180 /* Poor man's PCI discovery */ 177 /* Poor man's PCI discovery */
181 for (num = 0; num < 32; num++) 178 for (num = 0; num < 32; num++)
182 for (slot = 0; slot < 32; slot++) 179 for (slot = 0; slot < 32; slot++)
183 for (func = 0; func < 8; func++) 180 for (func = 0; func < 8; func++) {
184 check_dev_quirk(num, slot, func); 181 /* Only probe function 0 on single fn devices */
182 if (check_dev_quirk(num, slot, func))
183 break;
184 }
185} 185}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 643fd861b724..ff9e7350da54 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -196,7 +196,7 @@ static struct console simnow_console = {
196static struct console *early_console = &early_vga_console; 196static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 197static int early_console_initialized;
198 198
199void early_printk(const char *fmt, ...) 199asmlinkage void early_printk(const char *fmt, ...)
200{ 200{
201 char buf[512]; 201 char buf[512];
202 int n; 202 int n;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..06cc8d4254b1 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg)
64} 64}
65early_param("noefi", setup_noefi); 65early_param("noefi", setup_noefi);
66 66
67int add_efi_memmap;
68EXPORT_SYMBOL(add_efi_memmap);
69
70static int __init setup_add_efi_memmap(char *arg)
71{
72 add_efi_memmap = 1;
73 return 0;
74}
75early_param("add_efi_memmap", setup_add_efi_memmap);
76
77
67static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) 78static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
68{ 79{
69 return efi_call_virt2(get_time, tm, tc); 80 return efi_call_virt2(get_time, tm, tc);
@@ -213,6 +224,50 @@ unsigned long efi_get_time(void)
213 eft.minute, eft.second); 224 eft.minute, eft.second);
214} 225}
215 226
227/*
228 * Tell the kernel about the EFI memory map. This might include
229 * more than the max 128 entries that can fit in the e820 legacy
230 * (zeropage) memory map.
231 */
232
233static void __init do_add_efi_memmap(void)
234{
235 void *p;
236
237 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
238 efi_memory_desc_t *md = p;
239 unsigned long long start = md->phys_addr;
240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
241 int e820_type;
242
243 if (md->attribute & EFI_MEMORY_WB)
244 e820_type = E820_RAM;
245 else
246 e820_type = E820_RESERVED;
247 e820_add_region(start, size, e820_type);
248 }
249 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
250}
251
252void __init efi_reserve_early(void)
253{
254 unsigned long pmap;
255
256#ifdef CONFIG_X86_32
257 pmap = boot_params.efi_info.efi_memmap;
258#else
259 pmap = (boot_params.efi_info.efi_memmap |
260 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
261#endif
262 memmap.phys_map = (void *)pmap;
263 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
264 boot_params.efi_info.efi_memdesc_size;
265 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
266 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
267 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
268 "EFI memmap");
269}
270
216#if EFI_DEBUG 271#if EFI_DEBUG
217static void __init print_efi_memmap(void) 272static void __init print_efi_memmap(void)
218{ 273{
@@ -244,19 +299,11 @@ void __init efi_init(void)
244 299
245#ifdef CONFIG_X86_32 300#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; 301 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
248#else 302#else
249 efi_phys.systab = (efi_system_table_t *) 303 efi_phys.systab = (efi_system_table_t *)
250 (boot_params.efi_info.efi_systab | 304 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32)); 305 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif 306#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260 307
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab, 308 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t)); 309 sizeof(efi_system_table_t));
@@ -370,6 +417,8 @@ void __init efi_init(void)
370 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 417 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371 printk(KERN_WARNING "Kernel-defined memdesc" 418 printk(KERN_WARNING "Kernel-defined memdesc"
372 "doesn't match the one from EFI!\n"); 419 "doesn't match the one from EFI!\n");
420 if (add_efi_memmap)
421 do_add_efi_memmap();
373 422
374 /* Setup for EFI runtime service */ 423 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI; 424 reboot_type = BOOT_EFI;
@@ -424,7 +473,7 @@ void __init efi_enter_virtual_mode(void)
424 size = md->num_pages << EFI_PAGE_SHIFT; 473 size = md->num_pages << EFI_PAGE_SHIFT;
425 end = md->phys_addr + size; 474 end = md->phys_addr + size;
426 475
427 if (PFN_UP(end) <= max_pfn_mapped) 476 if (PFN_UP(end) <= max_low_pfn_mapped)
428 va = __va(md->phys_addr); 477 va = __va(md->phys_addr);
429 else 478 else
430 va = efi_ioremap(md->phys_addr, size); 479 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdcccac..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void)
97 early_runtime_code_mapping_set_exec(0); 97 early_runtime_code_mapping_set_exec(0);
98} 98}
99 99
100void __init efi_reserve_bootmem(void) 100void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
101{
102 reserve_bootmem_generic((unsigned long)memmap.phys_map,
103 memmap.nr_map * memmap.desc_size);
104}
105
106void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
107{ 101{
108 static unsigned pages_mapped __initdata; 102 static unsigned pages_mapped __initdata;
109 unsigned i, pages; 103 unsigned i, pages;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a2..109792bc7cfa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,14 +51,25 @@
51#include <asm/percpu.h> 51#include <asm/percpu.h>
52#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
53#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
54#include "irq_vectors.h" 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h>
56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
55 66
56/* 67/*
57 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret") 70 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 71 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). 72 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 * 73 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 74 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 75 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -331,8 +342,9 @@ sysenter_past_esp:
331 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
332 343
333 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
334 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
335 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
336 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
337 jae syscall_badsys 349 jae syscall_badsys
338 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -342,14 +354,54 @@ sysenter_past_esp:
342 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
343 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
344 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
345 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
346/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
347 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
348 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
349 xorl %ebp,%ebp 362 xorl %ebp,%ebp
350 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3511: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
352 ENABLE_INTERRUPTS_SYSCALL_RET 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
353 CFI_ENDPROC 405 CFI_ENDPROC
354.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3552: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -369,7 +421,7 @@ ENTRY(system_call)
369 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
370 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
371 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
372 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
373 jnz syscall_trace_entry 425 jnz syscall_trace_entry
374 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
375 jae syscall_badsys 427 jae syscall_badsys
@@ -382,10 +434,6 @@ syscall_exit:
382 # setting need_resched or sigpending 434 # setting need_resched or sigpending
383 # between sampling and the iret 435 # between sampling and the iret
384 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
385 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
386 jz no_singlestep
387 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
388no_singlestep:
389 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
390 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
391 jne syscall_exit_work 439 jne syscall_exit_work
@@ -513,12 +561,8 @@ END(work_pending)
513syscall_trace_entry: 561syscall_trace_entry:
514 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
515 movl %esp, %eax 563 movl %esp, %eax
516 xorl %edx,%edx 564 call syscall_trace_enter
517 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
518 cmpl $0, %eax
519 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
520 # so must skip actual syscall
521 movl PT_ORIG_EAX(%esp), %eax
522 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
523 jnae syscall_call 567 jnae syscall_call
524 jmp syscall_exit 568 jmp syscall_exit
@@ -527,14 +571,13 @@ END(syscall_trace_entry)
527 # perform syscall exit tracing 571 # perform syscall exit tracing
528 ALIGN 572 ALIGN
529syscall_exit_work: 573syscall_exit_work:
530 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
531 jz work_pending 575 jz work_pending
532 TRACE_IRQS_ON 576 TRACE_IRQS_ON
533 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
534 # schedule() instead 578 # schedule() instead
535 movl %esp, %eax 579 movl %esp, %eax
536 movl $1, %edx 580 call syscall_trace_leave
537 call do_syscall_trace
538 jmp resume_userspace 581 jmp resume_userspace
539END(syscall_exit_work) 582END(syscall_exit_work)
540 CFI_ENDPROC 583 CFI_ENDPROC
@@ -874,10 +917,10 @@ ENTRY(native_iret)
874.previous 917.previous
875END(native_iret) 918END(native_iret)
876 919
877ENTRY(native_irq_enable_syscall_ret) 920ENTRY(native_irq_enable_sysexit)
878 sti 921 sti
879 sysexit 922 sysexit
880END(native_irq_enable_syscall_ret) 923END(native_irq_enable_sysexit)
881#endif 924#endif
882 925
883KPROBE_ENTRY(int3) 926KPROBE_ENTRY(int3)
@@ -1023,7 +1066,9 @@ ENDPROC(kernel_thread_helper)
1023ENTRY(xen_sysenter_target) 1066ENTRY(xen_sysenter_target)
1024 RING0_INT_FRAME 1067 RING0_INT_FRAME
1025 addl $5*4, %esp /* remove xen-provided frame */ 1068 addl $5*4, %esp /* remove xen-provided frame */
1069 CFI_ADJUST_CFA_OFFSET -5*4
1026 jmp sysenter_past_esp 1070 jmp sysenter_past_esp
1071 CFI_ENDPROC
1027 1072
1028ENTRY(xen_hypervisor_callback) 1073ENTRY(xen_hypervisor_callback)
1029 CFI_STARTPROC 1074 CFI_STARTPROC
@@ -1110,6 +1155,77 @@ ENDPROC(xen_failsafe_callback)
1110 1155
1111#endif /* CONFIG_XEN */ 1156#endif /* CONFIG_XEN */
1112 1157
1158#ifdef CONFIG_FTRACE
1159#ifdef CONFIG_DYNAMIC_FTRACE
1160
1161ENTRY(mcount)
1162 pushl %eax
1163 pushl %ecx
1164 pushl %edx
1165 movl 0xc(%esp), %eax
1166 subl $MCOUNT_INSN_SIZE, %eax
1167
1168.globl mcount_call
1169mcount_call:
1170 call ftrace_stub
1171
1172 popl %edx
1173 popl %ecx
1174 popl %eax
1175
1176 ret
1177END(mcount)
1178
1179ENTRY(ftrace_caller)
1180 pushl %eax
1181 pushl %ecx
1182 pushl %edx
1183 movl 0xc(%esp), %eax
1184 movl 0x4(%ebp), %edx
1185 subl $MCOUNT_INSN_SIZE, %eax
1186
1187.globl ftrace_call
1188ftrace_call:
1189 call ftrace_stub
1190
1191 popl %edx
1192 popl %ecx
1193 popl %eax
1194
1195.globl ftrace_stub
1196ftrace_stub:
1197 ret
1198END(ftrace_caller)
1199
1200#else /* ! CONFIG_DYNAMIC_FTRACE */
1201
1202ENTRY(mcount)
1203 cmpl $ftrace_stub, ftrace_trace_function
1204 jnz trace
1205.globl ftrace_stub
1206ftrace_stub:
1207 ret
1208
1209 /* taken from glibc */
1210trace:
1211 pushl %eax
1212 pushl %ecx
1213 pushl %edx
1214 movl 0xc(%esp), %eax
1215 movl 0x4(%ebp), %edx
1216 subl $MCOUNT_INSN_SIZE, %eax
1217
1218 call *ftrace_trace_function
1219
1220 popl %edx
1221 popl %ecx
1222 popl %eax
1223
1224 jmp ftrace_stub
1225END(mcount)
1226#endif /* CONFIG_DYNAMIC_FTRACE */
1227#endif /* CONFIG_FTRACE */
1228
1113.section .rodata,"a" 1229.section .rodata,"a"
1114#include "syscall_table_32.S" 1230#include "syscall_table_32.S"
1115 1231
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..89434d439605 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -51,16 +51,127 @@
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h>
55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
54 61
55 .code64 62 .code64
56 63
64#ifdef CONFIG_FTRACE
65#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount)
67
68 subq $0x38, %rsp
69 movq %rax, (%rsp)
70 movq %rcx, 8(%rsp)
71 movq %rdx, 16(%rsp)
72 movq %rsi, 24(%rsp)
73 movq %rdi, 32(%rsp)
74 movq %r8, 40(%rsp)
75 movq %r9, 48(%rsp)
76
77 movq 0x38(%rsp), %rdi
78 subq $MCOUNT_INSN_SIZE, %rdi
79
80.globl mcount_call
81mcount_call:
82 call ftrace_stub
83
84 movq 48(%rsp), %r9
85 movq 40(%rsp), %r8
86 movq 32(%rsp), %rdi
87 movq 24(%rsp), %rsi
88 movq 16(%rsp), %rdx
89 movq 8(%rsp), %rcx
90 movq (%rsp), %rax
91 addq $0x38, %rsp
92
93 retq
94END(mcount)
95
96ENTRY(ftrace_caller)
97
98 /* taken from glibc */
99 subq $0x38, %rsp
100 movq %rax, (%rsp)
101 movq %rcx, 8(%rsp)
102 movq %rdx, 16(%rsp)
103 movq %rsi, 24(%rsp)
104 movq %rdi, 32(%rsp)
105 movq %r8, 40(%rsp)
106 movq %r9, 48(%rsp)
107
108 movq 0x38(%rsp), %rdi
109 movq 8(%rbp), %rsi
110 subq $MCOUNT_INSN_SIZE, %rdi
111
112.globl ftrace_call
113ftrace_call:
114 call ftrace_stub
115
116 movq 48(%rsp), %r9
117 movq 40(%rsp), %r8
118 movq 32(%rsp), %rdi
119 movq 24(%rsp), %rsi
120 movq 16(%rsp), %rdx
121 movq 8(%rsp), %rcx
122 movq (%rsp), %rax
123 addq $0x38, %rsp
124
125.globl ftrace_stub
126ftrace_stub:
127 retq
128END(ftrace_caller)
129
130#else /* ! CONFIG_DYNAMIC_FTRACE */
131ENTRY(mcount)
132 cmpq $ftrace_stub, ftrace_trace_function
133 jnz trace
134.globl ftrace_stub
135ftrace_stub:
136 retq
137
138trace:
139 /* taken from glibc */
140 subq $0x38, %rsp
141 movq %rax, (%rsp)
142 movq %rcx, 8(%rsp)
143 movq %rdx, 16(%rsp)
144 movq %rsi, 24(%rsp)
145 movq %rdi, 32(%rsp)
146 movq %r8, 40(%rsp)
147 movq %r9, 48(%rsp)
148
149 movq 0x38(%rsp), %rdi
150 movq 8(%rbp), %rsi
151 subq $MCOUNT_INSN_SIZE, %rdi
152
153 call *ftrace_trace_function
154
155 movq 48(%rsp), %r9
156 movq 40(%rsp), %r8
157 movq 32(%rsp), %rdi
158 movq 24(%rsp), %rsi
159 movq 16(%rsp), %rdx
160 movq 8(%rsp), %rcx
161 movq (%rsp), %rax
162 addq $0x38, %rsp
163
164 jmp ftrace_stub
165END(mcount)
166#endif /* CONFIG_DYNAMIC_FTRACE */
167#endif /* CONFIG_FTRACE */
168
57#ifndef CONFIG_PREEMPT 169#ifndef CONFIG_PREEMPT
58#define retint_kernel retint_restore_args 170#define retint_kernel retint_restore_args
59#endif 171#endif
60 172
61#ifdef CONFIG_PARAVIRT 173#ifdef CONFIG_PARAVIRT
62ENTRY(native_irq_enable_syscall_ret) 174ENTRY(native_usergs_sysret64)
63 movq %gs:pda_oldrsp,%rsp
64 swapgs 175 swapgs
65 sysretq 176 sysretq
66#endif /* CONFIG_PARAVIRT */ 177#endif /* CONFIG_PARAVIRT */
@@ -104,7 +215,7 @@ ENTRY(native_irq_enable_syscall_ret)
104 .macro FAKE_STACK_FRAME child_rip 215 .macro FAKE_STACK_FRAME child_rip
105 /* push in order ss, rsp, eflags, cs, rip */ 216 /* push in order ss, rsp, eflags, cs, rip */
106 xorl %eax, %eax 217 xorl %eax, %eax
107 pushq %rax /* ss */ 218 pushq $__KERNEL_DS /* ss */
108 CFI_ADJUST_CFA_OFFSET 8 219 CFI_ADJUST_CFA_OFFSET 8
109 /*CFI_REL_OFFSET ss,0*/ 220 /*CFI_REL_OFFSET ss,0*/
110 pushq %rax /* rsp */ 221 pushq %rax /* rsp */
@@ -169,13 +280,13 @@ ENTRY(ret_from_fork)
169 CFI_ADJUST_CFA_OFFSET -4 280 CFI_ADJUST_CFA_OFFSET -4
170 call schedule_tail 281 call schedule_tail
171 GET_THREAD_INFO(%rcx) 282 GET_THREAD_INFO(%rcx)
172 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) 283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
173 jnz rff_trace 284 jnz rff_trace
174rff_action: 285rff_action:
175 RESTORE_REST 286 RESTORE_REST
176 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 287 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
177 je int_ret_from_sys_call 288 je int_ret_from_sys_call
178 testl $_TIF_IA32,threadinfo_flags(%rcx) 289 testl $_TIF_IA32,TI_flags(%rcx)
179 jnz int_ret_from_sys_call 290 jnz int_ret_from_sys_call
180 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 291 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
181 jmp ret_from_sys_call 292 jmp ret_from_sys_call
@@ -244,8 +355,9 @@ ENTRY(system_call_after_swapgs)
244 movq %rcx,RIP-ARGOFFSET(%rsp) 355 movq %rcx,RIP-ARGOFFSET(%rsp)
245 CFI_REL_OFFSET rip,RIP-ARGOFFSET 356 CFI_REL_OFFSET rip,RIP-ARGOFFSET
246 GET_THREAD_INFO(%rcx) 357 GET_THREAD_INFO(%rcx)
247 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) 358 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
248 jnz tracesys 359 jnz tracesys
360system_call_fastpath:
249 cmpq $__NR_syscall_max,%rax 361 cmpq $__NR_syscall_max,%rax
250 ja badsys 362 ja badsys
251 movq %r10,%rcx 363 movq %r10,%rcx
@@ -263,7 +375,7 @@ sysret_check:
263 GET_THREAD_INFO(%rcx) 375 GET_THREAD_INFO(%rcx)
264 DISABLE_INTERRUPTS(CLBR_NONE) 376 DISABLE_INTERRUPTS(CLBR_NONE)
265 TRACE_IRQS_OFF 377 TRACE_IRQS_OFF
266 movl threadinfo_flags(%rcx),%edx 378 movl TI_flags(%rcx),%edx
267 andl %edi,%edx 379 andl %edi,%edx
268 jnz sysret_careful 380 jnz sysret_careful
269 CFI_REMEMBER_STATE 381 CFI_REMEMBER_STATE
@@ -275,7 +387,8 @@ sysret_check:
275 CFI_REGISTER rip,rcx 387 CFI_REGISTER rip,rcx
276 RESTORE_ARGS 0,-ARG_SKIP,1 388 RESTORE_ARGS 0,-ARG_SKIP,1
277 /*CFI_REGISTER rflags,r11*/ 389 /*CFI_REGISTER rflags,r11*/
278 ENABLE_INTERRUPTS_SYSCALL_RET 390 movq %gs:pda_oldrsp, %rsp
391 USERGS_SYSRET64
279 392
280 CFI_RESTORE_STATE 393 CFI_RESTORE_STATE
281 /* Handle reschedules */ 394 /* Handle reschedules */
@@ -296,16 +409,16 @@ sysret_careful:
296sysret_signal: 409sysret_signal:
297 TRACE_IRQS_ON 410 TRACE_IRQS_ON
298 ENABLE_INTERRUPTS(CLBR_NONE) 411 ENABLE_INTERRUPTS(CLBR_NONE)
299 testl $_TIF_DO_NOTIFY_MASK,%edx 412#ifdef CONFIG_AUDITSYSCALL
300 jz 1f 413 bt $TIF_SYSCALL_AUDIT,%edx
301 414 jc sysret_audit
302 /* Really a signal */ 415#endif
303 /* edx: work flags (arg3) */ 416 /* edx: work flags (arg3) */
304 leaq do_notify_resume(%rip),%rax 417 leaq do_notify_resume(%rip),%rax
305 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 418 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
306 xorl %esi,%esi # oldset -> arg2 419 xorl %esi,%esi # oldset -> arg2
307 call ptregscall_common 420 call ptregscall_common
3081: movl $_TIF_NEED_RESCHED,%edi 421 movl $_TIF_WORK_MASK,%edi
309 /* Use IRET because user could have changed frame. This 422 /* Use IRET because user could have changed frame. This
310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 423 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
311 DISABLE_INTERRUPTS(CLBR_NONE) 424 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -316,14 +429,56 @@ badsys:
316 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 429 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
317 jmp ret_from_sys_call 430 jmp ret_from_sys_call
318 431
432#ifdef CONFIG_AUDITSYSCALL
433 /*
434 * Fast path for syscall audit without full syscall trace.
435 * We just call audit_syscall_entry() directly, and then
436 * jump back to the normal fast path.
437 */
438auditsys:
439 movq %r10,%r9 /* 6th arg: 4th syscall arg */
440 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
441 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
442 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
443 movq %rax,%rsi /* 2nd arg: syscall number */
444 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
445 call audit_syscall_entry
446 LOAD_ARGS 0 /* reload call-clobbered registers */
447 jmp system_call_fastpath
448
449 /*
450 * Return fast path for syscall audit. Call audit_syscall_exit()
451 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
452 * masked off.
453 */
454sysret_audit:
455 movq %rax,%rsi /* second arg, syscall return value */
456 cmpq $0,%rax /* is it < 0? */
457 setl %al /* 1 if so, 0 if not */
458 movzbl %al,%edi /* zero-extend that into %edi */
459 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
460 call audit_syscall_exit
461 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
462 jmp sysret_check
463#endif /* CONFIG_AUDITSYSCALL */
464
319 /* Do syscall tracing */ 465 /* Do syscall tracing */
320tracesys: 466tracesys:
467#ifdef CONFIG_AUDITSYSCALL
468 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
469 jz auditsys
470#endif
321 SAVE_REST 471 SAVE_REST
322 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 472 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
323 FIXUP_TOP_OF_STACK %rdi 473 FIXUP_TOP_OF_STACK %rdi
324 movq %rsp,%rdi 474 movq %rsp,%rdi
325 call syscall_trace_enter 475 call syscall_trace_enter
326 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 476 /*
477 * Reload arg registers from stack in case ptrace changed them.
478 * We don't reload %rax because syscall_trace_enter() returned
479 * the value it wants us to use in the table lookup.
480 */
481 LOAD_ARGS ARGOFFSET, 1
327 RESTORE_REST 482 RESTORE_REST
328 cmpq $__NR_syscall_max,%rax 483 cmpq $__NR_syscall_max,%rax
329 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 484 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -337,6 +492,7 @@ tracesys:
337 * Has correct top of stack, but partial stack frame. 492 * Has correct top of stack, but partial stack frame.
338 */ 493 */
339 .globl int_ret_from_sys_call 494 .globl int_ret_from_sys_call
495 .globl int_with_check
340int_ret_from_sys_call: 496int_ret_from_sys_call:
341 DISABLE_INTERRUPTS(CLBR_NONE) 497 DISABLE_INTERRUPTS(CLBR_NONE)
342 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
@@ -347,10 +503,10 @@ int_ret_from_sys_call:
347int_with_check: 503int_with_check:
348 LOCKDEP_SYS_EXIT_IRQ 504 LOCKDEP_SYS_EXIT_IRQ
349 GET_THREAD_INFO(%rcx) 505 GET_THREAD_INFO(%rcx)
350 movl threadinfo_flags(%rcx),%edx 506 movl TI_flags(%rcx),%edx
351 andl %edi,%edx 507 andl %edi,%edx
352 jnz int_careful 508 jnz int_careful
353 andl $~TS_COMPAT,threadinfo_status(%rcx) 509 andl $~TS_COMPAT,TI_status(%rcx)
354 jmp retint_swapgs 510 jmp retint_swapgs
355 511
356 /* Either reschedule or signal or syscall exit tracking needed. */ 512 /* Either reschedule or signal or syscall exit tracking needed. */
@@ -376,7 +532,7 @@ int_very_careful:
376 ENABLE_INTERRUPTS(CLBR_NONE) 532 ENABLE_INTERRUPTS(CLBR_NONE)
377 SAVE_REST 533 SAVE_REST
378 /* Check for syscall exit trace */ 534 /* Check for syscall exit trace */
379 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 535 testl $_TIF_WORK_SYSCALL_EXIT,%edx
380 jz int_signal 536 jz int_signal
381 pushq %rdi 537 pushq %rdi
382 CFI_ADJUST_CFA_OFFSET 8 538 CFI_ADJUST_CFA_OFFSET 8
@@ -384,7 +540,7 @@ int_very_careful:
384 call syscall_trace_leave 540 call syscall_trace_leave
385 popq %rdi 541 popq %rdi
386 CFI_ADJUST_CFA_OFFSET -8 542 CFI_ADJUST_CFA_OFFSET -8
387 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 543 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
388 jmp int_restore_rest 544 jmp int_restore_rest
389 545
390int_signal: 546int_signal:
@@ -393,7 +549,7 @@ int_signal:
393 movq %rsp,%rdi # &ptregs -> arg1 549 movq %rsp,%rdi # &ptregs -> arg1
394 xorl %esi,%esi # oldset -> arg2 550 xorl %esi,%esi # oldset -> arg2
395 call do_notify_resume 551 call do_notify_resume
3961: movl $_TIF_NEED_RESCHED,%edi 5521: movl $_TIF_WORK_MASK,%edi
397int_restore_rest: 553int_restore_rest:
398 RESTORE_REST 554 RESTORE_REST
399 DISABLE_INTERRUPTS(CLBR_NONE) 555 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -420,7 +576,6 @@ END(\label)
420 PTREGSCALL stub_clone, sys_clone, %r8 576 PTREGSCALL stub_clone, sys_clone, %r8
421 PTREGSCALL stub_fork, sys_fork, %rdi 577 PTREGSCALL stub_fork, sys_fork, %rdi
422 PTREGSCALL stub_vfork, sys_vfork, %rdi 578 PTREGSCALL stub_vfork, sys_vfork, %rdi
423 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
424 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx 579 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
425 PTREGSCALL stub_iopl, sys_iopl, %rsi 580 PTREGSCALL stub_iopl, sys_iopl, %rsi
426 581
@@ -559,7 +714,7 @@ retint_with_reschedule:
559 movl $_TIF_WORK_MASK,%edi 714 movl $_TIF_WORK_MASK,%edi
560retint_check: 715retint_check:
561 LOCKDEP_SYS_EXIT_IRQ 716 LOCKDEP_SYS_EXIT_IRQ
562 movl threadinfo_flags(%rcx),%edx 717 movl TI_flags(%rcx),%edx
563 andl %edi,%edx 718 andl %edi,%edx
564 CFI_REMEMBER_STATE 719 CFI_REMEMBER_STATE
565 jnz retint_careful 720 jnz retint_careful
@@ -647,17 +802,16 @@ retint_signal:
647 RESTORE_REST 802 RESTORE_REST
648 DISABLE_INTERRUPTS(CLBR_NONE) 803 DISABLE_INTERRUPTS(CLBR_NONE)
649 TRACE_IRQS_OFF 804 TRACE_IRQS_OFF
650 movl $_TIF_NEED_RESCHED,%edi
651 GET_THREAD_INFO(%rcx) 805 GET_THREAD_INFO(%rcx)
652 jmp retint_check 806 jmp retint_with_reschedule
653 807
654#ifdef CONFIG_PREEMPT 808#ifdef CONFIG_PREEMPT
655 /* Returning to kernel space. Check if we need preemption */ 809 /* Returning to kernel space. Check if we need preemption */
656 /* rcx: threadinfo. interrupts off. */ 810 /* rcx: threadinfo. interrupts off. */
657ENTRY(retint_kernel) 811ENTRY(retint_kernel)
658 cmpl $0,threadinfo_preempt_count(%rcx) 812 cmpl $0,TI_preempt_count(%rcx)
659 jnz retint_restore_args 813 jnz retint_restore_args
660 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) 814 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
661 jnc retint_restore_args 815 jnc retint_restore_args
662 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ 816 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
663 jnc retint_restore_args 817 jnc retint_restore_args
@@ -711,6 +865,9 @@ END(invalidate_interrupt\num)
711ENTRY(call_function_interrupt) 865ENTRY(call_function_interrupt)
712 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt 866 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
713END(call_function_interrupt) 867END(call_function_interrupt)
868ENTRY(call_function_single_interrupt)
869 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
870END(call_function_single_interrupt)
714ENTRY(irq_move_cleanup_interrupt) 871ENTRY(irq_move_cleanup_interrupt)
715 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt 872 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
716END(irq_move_cleanup_interrupt) 873END(irq_move_cleanup_interrupt)
@@ -720,6 +877,10 @@ ENTRY(apic_timer_interrupt)
720 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 877 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
721END(apic_timer_interrupt) 878END(apic_timer_interrupt)
722 879
880ENTRY(uv_bau_message_intr1)
881 apicinterrupt 220,uv_bau_message_interrupt
882END(uv_bau_message_intr1)
883
723ENTRY(error_interrupt) 884ENTRY(error_interrupt)
724 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 885 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
725END(error_interrupt) 886END(error_interrupt)
@@ -733,6 +894,7 @@ END(spurious_interrupt)
733 */ 894 */
734 .macro zeroentry sym 895 .macro zeroentry sym
735 INTR_FRAME 896 INTR_FRAME
897 PARAVIRT_ADJUST_EXCEPTION_FRAME
736 pushq $0 /* push error code/oldrax */ 898 pushq $0 /* push error code/oldrax */
737 CFI_ADJUST_CFA_OFFSET 8 899 CFI_ADJUST_CFA_OFFSET 8
738 pushq %rax /* push real oldrax to the rdi slot */ 900 pushq %rax /* push real oldrax to the rdi slot */
@@ -745,6 +907,7 @@ END(spurious_interrupt)
745 907
746 .macro errorentry sym 908 .macro errorentry sym
747 XCPT_FRAME 909 XCPT_FRAME
910 PARAVIRT_ADJUST_EXCEPTION_FRAME
748 pushq %rax 911 pushq %rax
749 CFI_ADJUST_CFA_OFFSET 8 912 CFI_ADJUST_CFA_OFFSET 8
750 CFI_REL_OFFSET rax,0 913 CFI_REL_OFFSET rax,0
@@ -814,7 +977,7 @@ paranoid_restore\trace:
814 jmp irq_return 977 jmp irq_return
815paranoid_userspace\trace: 978paranoid_userspace\trace:
816 GET_THREAD_INFO(%rcx) 979 GET_THREAD_INFO(%rcx)
817 movl threadinfo_flags(%rcx),%ebx 980 movl TI_flags(%rcx),%ebx
818 andl $_TIF_WORK_MASK,%ebx 981 andl $_TIF_WORK_MASK,%ebx
819 jz paranoid_swapgs\trace 982 jz paranoid_swapgs\trace
820 movq %rsp,%rdi /* &pt_regs */ 983 movq %rsp,%rdi /* &pt_regs */
@@ -912,7 +1075,7 @@ error_exit:
912 testl %eax,%eax 1075 testl %eax,%eax
913 jne retint_kernel 1076 jne retint_kernel
914 LOCKDEP_SYS_EXIT_IRQ 1077 LOCKDEP_SYS_EXIT_IRQ
915 movl threadinfo_flags(%rcx),%edx 1078 movl TI_flags(%rcx),%edx
916 movl $_TIF_WORK_MASK,%edi 1079 movl $_TIF_WORK_MASK,%edi
917 andl %edi,%edx 1080 andl %edi,%edx
918 jnz retint_careful 1081 jnz retint_careful
@@ -926,11 +1089,11 @@ error_kernelspace:
926 iret run with kernel gs again, so don't set the user space flag. 1089 iret run with kernel gs again, so don't set the user space flag.
927 B stepping K8s sometimes report an truncated RIP for IRET 1090 B stepping K8s sometimes report an truncated RIP for IRET
928 exceptions returning to compat mode. Check for these here too. */ 1091 exceptions returning to compat mode. Check for these here too. */
929 leaq irq_return(%rip),%rbp 1092 leaq irq_return(%rip),%rcx
930 cmpq %rbp,RIP(%rsp) 1093 cmpq %rcx,RIP(%rsp)
931 je error_swapgs 1094 je error_swapgs
932 movl %ebp,%ebp /* zero extend */ 1095 movl %ecx,%ecx /* zero extend */
933 cmpq %rbp,RIP(%rsp) 1096 cmpq %rcx,RIP(%rsp)
934 je error_swapgs 1097 je error_swapgs
935 cmpq $gs_change,RIP(%rsp) 1098 cmpq $gs_change,RIP(%rsp)
936 je error_swapgs 1099 je error_swapgs
@@ -939,7 +1102,7 @@ KPROBE_END(error_entry)
939 1102
940 /* Reload gs selector with exception handling */ 1103 /* Reload gs selector with exception handling */
941 /* edi: new selector */ 1104 /* edi: new selector */
942ENTRY(load_gs_index) 1105ENTRY(native_load_gs_index)
943 CFI_STARTPROC 1106 CFI_STARTPROC
944 pushf 1107 pushf
945 CFI_ADJUST_CFA_OFFSET 8 1108 CFI_ADJUST_CFA_OFFSET 8
@@ -953,7 +1116,7 @@ gs_change:
953 CFI_ADJUST_CFA_OFFSET -8 1116 CFI_ADJUST_CFA_OFFSET -8
954 ret 1117 ret
955 CFI_ENDPROC 1118 CFI_ENDPROC
956ENDPROC(load_gs_index) 1119ENDPROC(native_load_gs_index)
957 1120
958 .section __ex_table,"a" 1121 .section __ex_table,"a"
959 .align 8 1122 .align 8
@@ -1075,6 +1238,7 @@ END(device_not_available)
1075 /* runs on exception stack */ 1238 /* runs on exception stack */
1076KPROBE_ENTRY(debug) 1239KPROBE_ENTRY(debug)
1077 INTR_FRAME 1240 INTR_FRAME
1241 PARAVIRT_ADJUST_EXCEPTION_FRAME
1078 pushq $0 1242 pushq $0
1079 CFI_ADJUST_CFA_OFFSET 8 1243 CFI_ADJUST_CFA_OFFSET 8
1080 paranoidentry do_debug, DEBUG_STACK 1244 paranoidentry do_debug, DEBUG_STACK
@@ -1084,6 +1248,7 @@ KPROBE_END(debug)
1084 /* runs on exception stack */ 1248 /* runs on exception stack */
1085KPROBE_ENTRY(nmi) 1249KPROBE_ENTRY(nmi)
1086 INTR_FRAME 1250 INTR_FRAME
1251 PARAVIRT_ADJUST_EXCEPTION_FRAME
1087 pushq $-1 1252 pushq $-1
1088 CFI_ADJUST_CFA_OFFSET 8 1253 CFI_ADJUST_CFA_OFFSET 8
1089 paranoidentry do_nmi, 0, 0 1254 paranoidentry do_nmi, 0, 0
@@ -1097,6 +1262,7 @@ KPROBE_END(nmi)
1097 1262
1098KPROBE_ENTRY(int3) 1263KPROBE_ENTRY(int3)
1099 INTR_FRAME 1264 INTR_FRAME
1265 PARAVIRT_ADJUST_EXCEPTION_FRAME
1100 pushq $0 1266 pushq $0
1101 CFI_ADJUST_CFA_OFFSET 8 1267 CFI_ADJUST_CFA_OFFSET 8
1102 paranoidentry do_int3, DEBUG_STACK 1268 paranoidentry do_int3, DEBUG_STACK
@@ -1120,13 +1286,10 @@ ENTRY(coprocessor_segment_overrun)
1120 zeroentry do_coprocessor_segment_overrun 1286 zeroentry do_coprocessor_segment_overrun
1121END(coprocessor_segment_overrun) 1287END(coprocessor_segment_overrun)
1122 1288
1123ENTRY(reserved)
1124 zeroentry do_reserved
1125END(reserved)
1126
1127 /* runs on exception stack */ 1289 /* runs on exception stack */
1128ENTRY(double_fault) 1290ENTRY(double_fault)
1129 XCPT_FRAME 1291 XCPT_FRAME
1292 PARAVIRT_ADJUST_EXCEPTION_FRAME
1130 paranoidentry do_double_fault 1293 paranoidentry do_double_fault
1131 jmp paranoid_exit1 1294 jmp paranoid_exit1
1132 CFI_ENDPROC 1295 CFI_ENDPROC
@@ -1143,6 +1306,7 @@ END(segment_not_present)
1143 /* runs on exception stack */ 1306 /* runs on exception stack */
1144ENTRY(stack_segment) 1307ENTRY(stack_segment)
1145 XCPT_FRAME 1308 XCPT_FRAME
1309 PARAVIRT_ADJUST_EXCEPTION_FRAME
1146 paranoidentry do_stack_segment 1310 paranoidentry do_stack_segment
1147 jmp paranoid_exit1 1311 jmp paranoid_exit1
1148 CFI_ENDPROC 1312 CFI_ENDPROC
@@ -1168,6 +1332,7 @@ END(spurious_interrupt_bug)
1168 /* runs on exception stack */ 1332 /* runs on exception stack */
1169ENTRY(machine_check) 1333ENTRY(machine_check)
1170 INTR_FRAME 1334 INTR_FRAME
1335 PARAVIRT_ADJUST_EXCEPTION_FRAME
1171 pushq $0 1336 pushq $0
1172 CFI_ADJUST_CFA_OFFSET 8 1337 CFI_ADJUST_CFA_OFFSET 8
1173 paranoidentry do_machine_check 1338 paranoidentry do_machine_check
@@ -1202,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret)
1202 sysret 1367 sysret
1203 CFI_ENDPROC 1368 CFI_ENDPROC
1204ENDPROC(ignore_sysret) 1369ENDPROC(ignore_sysret)
1370
1371#ifdef CONFIG_XEN
1372ENTRY(xen_hypervisor_callback)
1373 zeroentry xen_do_hypervisor_callback
1374END(xen_hypervisor_callback)
1375
1376/*
1377# A note on the "critical region" in our callback handler.
1378# We want to avoid stacking callback handlers due to events occurring
1379# during handling of the last event. To do this, we keep events disabled
1380# until we've done all processing. HOWEVER, we must enable events before
1381# popping the stack frame (can't be done atomically) and so it would still
1382# be possible to get enough handler activations to overflow the stack.
1383# Although unlikely, bugs of that kind are hard to track down, so we'd
1384# like to avoid the possibility.
1385# So, on entry to the handler we detect whether we interrupted an
1386# existing activation in its critical region -- if so, we pop the current
1387# activation and restart the handler using the previous one.
1388*/
1389ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1390 CFI_STARTPROC
1391/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1392 see the correct pointer to the pt_regs */
1393 movq %rdi, %rsp # we don't return, adjust the stack frame
1394 CFI_ENDPROC
1395 CFI_DEFAULT_STACK
139611: incl %gs:pda_irqcount
1397 movq %rsp,%rbp
1398 CFI_DEF_CFA_REGISTER rbp
1399 cmovzq %gs:pda_irqstackptr,%rsp
1400 pushq %rbp # backlink for old unwinder
1401 call xen_evtchn_do_upcall
1402 popq %rsp
1403 CFI_DEF_CFA_REGISTER rsp
1404 decl %gs:pda_irqcount
1405 jmp error_exit
1406 CFI_ENDPROC
1407END(do_hypervisor_callback)
1408
1409/*
1410# Hypervisor uses this for application faults while it executes.
1411# We get here for two reasons:
1412# 1. Fault while reloading DS, ES, FS or GS
1413# 2. Fault while executing IRET
1414# Category 1 we do not need to fix up as Xen has already reloaded all segment
1415# registers that could be reloaded and zeroed the others.
1416# Category 2 we fix up by killing the current process. We cannot use the
1417# normal Linux return path in this case because if we use the IRET hypercall
1418# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1419# We distinguish between categories by comparing each saved segment register
1420# with its current contents: any discrepancy means we in category 1.
1421*/
1422ENTRY(xen_failsafe_callback)
1423 framesz = (RIP-0x30) /* workaround buggy gas */
1424 _frame framesz
1425 CFI_REL_OFFSET rcx, 0
1426 CFI_REL_OFFSET r11, 8
1427 movw %ds,%cx
1428 cmpw %cx,0x10(%rsp)
1429 CFI_REMEMBER_STATE
1430 jne 1f
1431 movw %es,%cx
1432 cmpw %cx,0x18(%rsp)
1433 jne 1f
1434 movw %fs,%cx
1435 cmpw %cx,0x20(%rsp)
1436 jne 1f
1437 movw %gs,%cx
1438 cmpw %cx,0x28(%rsp)
1439 jne 1f
1440 /* All segments match their saved values => Category 2 (Bad IRET). */
1441 movq (%rsp),%rcx
1442 CFI_RESTORE rcx
1443 movq 8(%rsp),%r11
1444 CFI_RESTORE r11
1445 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 pushq %r11
1450 CFI_ADJUST_CFA_OFFSET 8
1451 pushq %rcx
1452 CFI_ADJUST_CFA_OFFSET 8
1453 jmp general_protection
1454 CFI_RESTORE_STATE
14551: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1456 movq (%rsp),%rcx
1457 CFI_RESTORE rcx
1458 movq 8(%rsp),%r11
1459 CFI_RESTORE r11
1460 addq $0x30,%rsp
1461 CFI_ADJUST_CFA_OFFSET -0x30
1462 pushq $0
1463 CFI_ADJUST_CFA_OFFSET 8
1464 SAVE_ALL
1465 jmp error_exit
1466 CFI_ENDPROC
1467END(xen_failsafe_callback)
1468
1469#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000000..ab115cd15fdf
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,141 @@
1/*
2 * Code for replacing ftrace calls with jumps.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 *
6 * Thanks goes to Ingo Molnar, for suggesting the idea.
7 * Mathieu Desnoyers, for suggesting postponing the modifications.
8 * Arjan van de Ven, for keeping me straight, and explaining to me
9 * the dangers of modifying code on the run.
10 */
11
12#include <linux/spinlock.h>
13#include <linux/hardirq.h>
14#include <linux/ftrace.h>
15#include <linux/percpu.h>
16#include <linux/init.h>
17#include <linux/list.h>
18
19#include <asm/alternative.h>
20#include <asm/ftrace.h>
21
22
23/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop;
25
26union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE];
28 struct {
29 char e8;
30 int offset;
31 } __attribute__((packed));
32};
33
34
35static int notrace ftrace_calc_offset(long ip, long addr)
36{
37 return (int)(addr - ip);
38}
39
40notrace unsigned char *ftrace_nop_replace(void)
41{
42 return (char *)ftrace_nop;
43}
44
45notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{
47 static union ftrace_code_union calc;
48
49 calc.e8 = 0xe8;
50 calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
51
52 /*
53 * No locking needed, this must be called via kstop_machine
54 * which in essence is like running on a uniprocessor machine.
55 */
56 return calc.code;
57}
58
59notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code)
62{
63 unsigned replaced;
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68
69 /*
70 * Note: Due to modules and __init, code can
71 * disappear and change, we need to protect against faulting
72 * as well as code changing.
73 *
74 * No real locking needed, this code is run through
75 * kstop_machine.
76 */
77 asm volatile (
78 "1: lock\n"
79 " cmpxchg %3, (%2)\n"
80 " jnz 2f\n"
81 " movb %b4, 4(%2)\n"
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93
94 if (replaced != old && replaced != new)
95 faulted = 2;
96
97 return faulted;
98}
99
100notrace int ftrace_update_ftrace_func(ftrace_func_t func)
101{
102 unsigned long ip = (unsigned long)(&ftrace_call);
103 unsigned char old[MCOUNT_INSN_SIZE], *new;
104 int ret;
105
106 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
107 new = ftrace_call_replace(ip, (unsigned long)func);
108 ret = ftrace_modify_code(ip, old, new);
109
110 return ret;
111}
112
113notrace int ftrace_mcount_set(unsigned long *data)
114{
115 unsigned long ip = (long)(&mcount_call);
116 unsigned long *addr = data;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0;
128}
129
130int __init ftrace_dyn_arch_init(void *data)
131{
132 const unsigned char *const *noptable = find_nop_table();
133
134 /* This is running in kstop_machine */
135
136 ftrace_mcount_set(data);
137
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
139
140 return 0;
141}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..1fa8be5bd217 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
51 else 51 else
52#endif 52#endif
53 53
54 if (num_possible_cpus() <= 8) 54 if (max_physical_apicid < 8)
55 genapic = &apic_flat; 55 genapic = &apic_flat;
56 else 56 else
57 genapic = &apic_physflat; 57 genapic = &apic_physflat;
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..786548a62d38 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 168 * May as well be the first.
169 */ 169 */
170 cpu = first_cpu(cpumask); 170 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 171 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 172 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 173 else
174 return BAD_APICID; 174 return BAD_APICID;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a743..2cfcbded888a 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,9 +5,10 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#include <linux/kernel.h>
11#include <linux/threads.h> 12#include <linux/threads.h>
12#include <linux/cpumask.h> 13#include <linux/cpumask.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -20,8 +21,10 @@
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/pgtable.h>
23#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
25 28
26DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
27EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -38,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
38short uv_possible_blades; 41short uv_possible_blades;
39EXPORT_SYMBOL_GPL(uv_possible_blades); 42EXPORT_SYMBOL_GPL(uv_possible_blades);
40 43
44unsigned long sn_rtc_cycles_per_second;
45EXPORT_SYMBOL(sn_rtc_cycles_per_second);
46
41/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 47/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
42 48
43static cpumask_t uv_target_cpus(void) 49static cpumask_t uv_target_cpus(void)
@@ -55,44 +61,44 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
55int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 61int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
56{ 62{
57 unsigned long val; 63 unsigned long val;
58 int nasid; 64 int pnode;
59 65
60 nasid = uv_apicid_to_nasid(phys_apicid); 66 pnode = uv_apicid_to_pnode(phys_apicid);
61 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 67 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
62 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 68 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
63 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 69 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
64 APIC_DM_INIT; 70 APIC_DM_INIT;
65 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 71 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
66 mdelay(10); 72 mdelay(10);
67 73
68 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 74 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
69 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 75 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
70 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 76 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
71 APIC_DM_STARTUP; 77 APIC_DM_STARTUP;
72 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 78 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
73 return 0; 79 return 0;
74} 80}
75 81
76static void uv_send_IPI_one(int cpu, int vector) 82static void uv_send_IPI_one(int cpu, int vector)
77{ 83{
78 unsigned long val, apicid, lapicid; 84 unsigned long val, apicid, lapicid;
79 int nasid; 85 int pnode;
80 86
81 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 87 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
82 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 88 lapicid = apicid & 0x3f; /* ZZZ macro needed */
83 nasid = uv_apicid_to_nasid(apicid); 89 pnode = uv_apicid_to_pnode(apicid);
84 val = 90 val =
85 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << 91 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
86 UVH_IPI_INT_APIC_ID_SHFT) | 92 UVH_IPI_INT_APIC_ID_SHFT) |
87 (vector << UVH_IPI_INT_VECTOR_SHFT); 93 (vector << UVH_IPI_INT_VECTOR_SHFT);
88 uv_write_global_mmr64(nasid, UVH_IPI_INT, val); 94 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
89} 95}
90 96
91static void uv_send_IPI_mask(cpumask_t mask, int vector) 97static void uv_send_IPI_mask(cpumask_t mask, int vector)
92{ 98{
93 unsigned int cpu; 99 unsigned int cpu;
94 100
95 for (cpu = 0; cpu < NR_CPUS; ++cpu) 101 for_each_possible_cpu(cpu)
96 if (cpu_isset(cpu, mask)) 102 if (cpu_isset(cpu, mask))
97 uv_send_IPI_one(cpu, vector); 103 uv_send_IPI_one(cpu, vector);
98} 104}
@@ -126,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
126 * May as well be the first. 132 * May as well be the first.
127 */ 133 */
128 cpu = first_cpu(cpumask); 134 cpu = first_cpu(cpumask);
129 if ((unsigned)cpu < NR_CPUS) 135 if ((unsigned)cpu < nr_cpu_ids)
130 return per_cpu(x86_cpu_to_apicid, cpu); 136 return per_cpu(x86_cpu_to_apicid, cpu);
131 else 137 else
132 return BAD_APICID; 138 return BAD_APICID;
@@ -159,39 +165,163 @@ struct genapic apic_x2apic_uv_x = {
159 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 165 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
160}; 166};
161 167
162static __cpuinit void set_x2apic_extra_bits(int nasid) 168static __cpuinit void set_x2apic_extra_bits(int pnode)
163{ 169{
164 __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6); 170 __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
165} 171}
166 172
167/* 173/*
168 * Called on boot cpu. 174 * Called on boot cpu.
169 */ 175 */
176static __init int boot_pnode_to_blade(int pnode)
177{
178 int blade;
179
180 for (blade = 0; blade < uv_num_possible_blades(); blade++)
181 if (pnode == uv_blade_info[blade].pnode)
182 return blade;
183 BUG();
184}
185
186struct redir_addr {
187 unsigned long redirect;
188 unsigned long alias;
189};
190
191#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
192
193static __initdata struct redir_addr redir_addrs[] = {
194 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
195 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
196 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
197};
198
199static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
200{
201 union uvh_si_alias0_overlay_config_u alias;
202 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
203 int i;
204
205 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
206 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
207 if (alias.s.base == 0) {
208 *size = (1UL << alias.s.m_alias);
209 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
210 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
211 return;
212 }
213 }
214 BUG();
215}
216
217static __init void map_low_mmrs(void)
218{
219 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
220 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
221}
222
223enum map_type {map_wb, map_uc};
224
225static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
226{
227 unsigned long bytes, paddr;
228
229 paddr = base << shift;
230 bytes = (1UL << shift);
231 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
232 paddr + bytes);
233 if (map_type == map_uc)
234 init_extra_mapping_uc(paddr, bytes);
235 else
236 init_extra_mapping_wb(paddr, bytes);
237
238}
239static __init void map_gru_high(int max_pnode)
240{
241 union uvh_rh_gam_gru_overlay_config_mmr_u gru;
242 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
243
244 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
245 if (gru.s.enable)
246 map_high("GRU", gru.s.base, shift, map_wb);
247}
248
249static __init void map_config_high(int max_pnode)
250{
251 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
252 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
253
254 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
255 if (cfg.s.enable)
256 map_high("CONFIG", cfg.s.base, shift, map_uc);
257}
258
259static __init void map_mmr_high(int max_pnode)
260{
261 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
262 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
263
264 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
265 if (mmr.s.enable)
266 map_high("MMR", mmr.s.base, shift, map_uc);
267}
268
269static __init void map_mmioh_high(int max_pnode)
270{
271 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
272 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
273
274 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
275 if (mmioh.s.enable)
276 map_high("MMIOH", mmioh.s.base, shift, map_uc);
277}
278
279static __init void uv_rtc_init(void)
280{
281 long status, ticks_per_sec, drift;
282
283 status =
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
285 &drift);
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, "
289 "guessing.\n");
290 /* BIOS gives wrong value for clock freq. so guess */
291 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
292 } else
293 sn_rtc_cycles_per_second = ticks_per_sec;
294}
295
170static __init void uv_system_init(void) 296static __init void uv_system_init(void)
171{ 297{
172 union uvh_si_addr_map_config_u m_n_config; 298 union uvh_si_addr_map_config_u m_n_config;
173 int bytes, nid, cpu, lcpu, nasid, last_nasid, blade; 299 union uvh_node_id_u node_id;
174 unsigned long mmr_base; 300 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
301 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
302 int max_pnode = 0;
303 unsigned long mmr_base, present;
304
305 map_low_mmrs();
175 306
176 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 307 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
308 m_val = m_n_config.s.m_skt;
309 n_val = m_n_config.s.n_skt;
177 mmr_base = 310 mmr_base =
178 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 311 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
179 ~UV_MMR_ENABLE; 312 ~UV_MMR_ENABLE;
180 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 313 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
181 314
182 last_nasid = -1; 315 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
183 for_each_possible_cpu(cpu) { 316 uv_possible_blades +=
184 nid = cpu_to_node(cpu); 317 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
185 nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
186 if (nasid != last_nasid)
187 uv_possible_blades++;
188 last_nasid = nasid;
189 }
190 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 318 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
191 319
192 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 320 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
193 uv_blade_info = alloc_bootmem_pages(bytes); 321 uv_blade_info = alloc_bootmem_pages(bytes);
194 322
323 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
324
195 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 325 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
196 uv_node_to_blade = alloc_bootmem_pages(bytes); 326 uv_node_to_blade = alloc_bootmem_pages(bytes);
197 memset(uv_node_to_blade, 255, bytes); 327 memset(uv_node_to_blade, 255, bytes);
@@ -200,43 +330,64 @@ static __init void uv_system_init(void)
200 uv_cpu_to_blade = alloc_bootmem_pages(bytes); 330 uv_cpu_to_blade = alloc_bootmem_pages(bytes);
201 memset(uv_cpu_to_blade, 255, bytes); 331 memset(uv_cpu_to_blade, 255, bytes);
202 332
203 last_nasid = -1; 333 blade = 0;
204 blade = -1; 334 for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
205 lcpu = -1; 335 present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
206 for_each_possible_cpu(cpu) { 336 for (j = 0; j < 64; j++) {
207 nid = cpu_to_node(cpu); 337 if (!test_bit(j, &present))
208 nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); 338 continue;
209 if (nasid != last_nasid) { 339 uv_blade_info[blade].pnode = (i * 64 + j);
210 blade++; 340 uv_blade_info[blade].nr_possible_cpus = 0;
211 lcpu = -1;
212 uv_blade_info[blade].nr_posible_cpus = 0;
213 uv_blade_info[blade].nr_online_cpus = 0; 341 uv_blade_info[blade].nr_online_cpus = 0;
342 blade++;
214 } 343 }
215 last_nasid = nasid; 344 }
216 lcpu++; 345
346 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
347 gnode_upper = (((unsigned long)node_id.s.node_id) &
348 ~((1 << n_val) - 1)) << m_val;
217 349
218 uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt; 350 uv_rtc_init();
219 uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt; 351
352 for_each_present_cpu(cpu) {
353 nid = cpu_to_node(cpu);
354 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
355 blade = boot_pnode_to_blade(pnode);
356 lcpu = uv_blade_info[blade].nr_possible_cpus;
357 uv_blade_info[blade].nr_possible_cpus++;
358
359 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
360 uv_cpu_hub_info(cpu)->lowmem_remap_top =
361 lowmem_redir_base + lowmem_redir_size;
362 uv_cpu_hub_info(cpu)->m_val = m_val;
363 uv_cpu_hub_info(cpu)->n_val = m_val;
220 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 364 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
221 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 365 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
222 uv_cpu_hub_info(cpu)->local_nasid = nasid; 366 uv_cpu_hub_info(cpu)->pnode = pnode;
223 uv_cpu_hub_info(cpu)->gnode_upper = 367 uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
224 nasid & ~((1 << uv_hub_info->n_val) - 1); 368 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
369 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
225 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 370 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
226 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 371 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
227 uv_blade_info[blade].nasid = nasid;
228 uv_blade_info[blade].nr_posible_cpus++;
229 uv_node_to_blade[nid] = blade; 372 uv_node_to_blade[nid] = blade;
230 uv_cpu_to_blade[cpu] = blade; 373 uv_cpu_to_blade[cpu] = blade;
374 max_pnode = max(pnode, max_pnode);
231 375
232 printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n", 376 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
233 cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid); 377 "lcpu %d, blade %d\n",
234 printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade); 378 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
379 lcpu, blade);
235 } 380 }
381
382 map_gru_high(max_pnode);
383 map_mmr_high(max_pnode);
384 map_config_high(max_pnode);
385 map_mmioh_high(max_pnode);
236} 386}
237 387
238/* 388/*
239 * Called on each cpu to initialize the per_cpu UV data area. 389 * Called on each cpu to initialize the per_cpu UV data area.
390 * ZZZ hotplug not supported yet
240 */ 391 */
241void __cpuinit uv_cpu_init(void) 392void __cpuinit uv_cpu_init(void)
242{ 393{
@@ -246,5 +397,5 @@ void __cpuinit uv_cpu_init(void)
246 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; 397 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
247 398
248 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) 399 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
249 set_x2apic_extra_bits(uv_hub_info->local_nasid); 400 set_x2apic_extra_bits(uv_hub_info->pnode);
250} 401}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..3e66bd364a9d
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,55 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3
4#include <asm/setup.h>
5#include <asm/bios_ebda.h>
6
7#define BIOS_LOWMEM_KILOBYTES 0x413
8
9/*
10 * The BIOS places the EBDA/XBDA at the top of conventional
11 * memory, and usually decreases the reported amount of
12 * conventional memory (int 0x12) too. This also contains a
13 * workaround for Dell systems that neglect to reserve EBDA.
14 * The same workaround also avoids a problem with the AMD768MPX
15 * chipset: reserve a page before VGA to prevent PCI prefetch
16 * into it (errata #56). Usually the page is reserved anyways,
17 * unless you have no PS/2 mouse plugged in.
18 */
19void __init reserve_ebda_region(void)
20{
21 unsigned int lowmem, ebda_addr;
22
23 /* To determine the position of the EBDA and the */
24 /* end of conventional memory, we need to look at */
25 /* the BIOS data area. In a paravirtual environment */
26 /* that area is absent. We'll just have to assume */
27 /* that the paravirt case can handle memory setup */
28 /* correctly, without our help. */
29 if (paravirt_enabled())
30 return;
31
32 /* end of low (conventional) memory */
33 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
34 lowmem <<= 10;
35
36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda();
38
39 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */
41 if ((lowmem - ebda_addr) <= 0x10000)
42 lowmem = ebda_addr;
43
44 /* Fixup: bios does not report an EBDA at all. */
45 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
46 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
47 lowmem = 0x9f000;
48
49 /* Paranoia: should never happen, but... */
50 if ((lowmem == 0) || (lowmem >= 0x100000))
51 lowmem = 0x9f000;
52
53 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
55}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10 10
11#include <asm/setup.h>
12#include <asm/sections.h>
13#include <asm/e820.h>
14#include <asm/bios_ebda.h>
15
11void __init i386_start_kernel(void) 16void __init i386_start_kernel(void)
12{ 17{
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19
20#ifdef CONFIG_BLK_DEV_INITRD
21 /* Reserve INITRD */
22 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
23 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
24 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
25 u64 ramdisk_end = ramdisk_image + ramdisk_size;
26 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
27 }
28#endif
29 reserve_early(init_pg_tables_start, init_pg_tables_end,
30 "INIT_PG_TABLE");
31
32 reserve_ebda_region();
33
34 /*
35 * At this point everything still needed from the boot loader
36 * or BIOS or kernel text should be early reserved or marked not
37 * RAM in e820. All other memory is free game.
38 */
39
13 start_kernel(); 40 start_kernel();
14} 41}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa84..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,27 @@
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27 27
28/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly;
30
31#ifdef CONFIG_SMP
32/*
33 * We install an empty cpu_pda pointer table to indicate to early users
34 * (numa_set_node) that the cpu_pda pointer table for cpus other than
35 * the boot cpu is not yet setup.
36 */
37static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
38#else
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif
41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
28static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
29{ 50{
30 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -51,74 +72,6 @@ static void __init copy_bootdata(char *real_mode_data)
51 } 72 }
52} 73}
53 74
54#define BIOS_LOWMEM_KILOBYTES 0x413
55
56/*
57 * The BIOS places the EBDA/XBDA at the top of conventional
58 * memory, and usually decreases the reported amount of
59 * conventional memory (int 0x12) too. This also contains a
60 * workaround for Dell systems that neglect to reserve EBDA.
61 * The same workaround also avoids a problem with the AMD768MPX
62 * chipset: reserve a page before VGA to prevent PCI prefetch
63 * into it (errata #56). Usually the page is reserved anyways,
64 * unless you have no PS/2 mouse plugged in.
65 */
66static void __init reserve_ebda_region(void)
67{
68 unsigned int lowmem, ebda_addr;
69
70 /* To determine the position of the EBDA and the */
71 /* end of conventional memory, we need to look at */
72 /* the BIOS data area. In a paravirtual environment */
73 /* that area is absent. We'll just have to assume */
74 /* that the paravirt case can handle memory setup */
75 /* correctly, without our help. */
76 if (paravirt_enabled())
77 return;
78
79 /* end of low (conventional) memory */
80 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
81 lowmem <<= 10;
82
83 /* start of EBDA area */
84 ebda_addr = get_bios_ebda();
85
86 /* Fixup: bios puts an EBDA in the top 64K segment */
87 /* of conventional memory, but does not adjust lowmem. */
88 if ((lowmem - ebda_addr) <= 0x10000)
89 lowmem = ebda_addr;
90
91 /* Fixup: bios does not report an EBDA at all. */
92 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
93 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
94 lowmem = 0x9f000;
95
96 /* Paranoia: should never happen, but... */
97 if ((lowmem == 0) || (lowmem >= 0x100000))
98 lowmem = 0x9f000;
99
100 /* reserve all memory between lowmem and the 1MB mark */
101 reserve_early(lowmem, 0x100000, "BIOS reserved");
102}
103
104static void __init reserve_setup_data(void)
105{
106 struct setup_data *data;
107 unsigned long pa_data;
108 char buf[32];
109
110 if (boot_params.hdr.version < 0x0209)
111 return;
112 pa_data = boot_params.hdr.setup_data;
113 while (pa_data) {
114 data = early_ioremap(pa_data, sizeof(*data));
115 sprintf(buf, "setup data %x", data->type);
116 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
117 pa_data = data->next;
118 early_iounmap(data, sizeof(*data));
119 }
120}
121
122void __init x86_64_start_kernel(char * real_mode_data) 75void __init x86_64_start_kernel(char * real_mode_data)
123{ 76{
124 int i; 77 int i;
@@ -156,10 +109,15 @@ void __init x86_64_start_kernel(char * real_mode_data)
156 109
157 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
158 111
159 for (i = 0; i < NR_CPUS; i++) 112 x86_64_init_pda();
160 cpu_pda(i) = &boot_cpu_pda[i];
161 113
162 pda_init(0); 114 early_printk("Kernel really alive\n");
115
116 x86_64_start_reservations(real_mode_data);
117}
118
119void __init x86_64_start_reservations(char *real_mode_data)
120{
163 copy_bootdata(__va(real_mode_data)); 121 copy_bootdata(__va(real_mode_data));
164 122
165 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
@@ -175,7 +133,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
175#endif 133#endif
176 134
177 reserve_ebda_region(); 135 reserve_ebda_region();
178 reserve_setup_data();
179 136
180 /* 137 /*
181 * At this point everything still needed from the boot loader 138 * At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162c..f67e93441caf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
194 xorl %ebx,%ebx /* %ebx is kept at zero */ 194 xorl %ebx,%ebx /* %ebx is kept at zero */
195 195
196 movl $pa(pg0), %edi 196 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start)
197 movl $pa(swapper_pg_pmd), %edx 198 movl $pa(swapper_pg_pmd), %edx
198 movl $PTE_ATTR, %eax 199 movl $PTE_ATTR, %eax
19910: 20010:
@@ -219,6 +220,8 @@ default_entry:
219 jb 10b 220 jb 10b
2201: 2211:
221 movl %edi,pa(init_pg_tables_end) 222 movl %edi,pa(init_pg_tables_end)
223 shrl $12, %eax
224 movl %eax, pa(max_pfn_mapped)
222 225
223 /* Do early initialization of the fixmap area */ 226 /* Do early initialization of the fixmap area */
224 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
228page_pde_offset = (__PAGE_OFFSET >> 20); 231page_pde_offset = (__PAGE_OFFSET >> 20);
229 232
230 movl $pa(pg0), %edi 233 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start)
231 movl $pa(swapper_pg_dir), %edx 235 movl $pa(swapper_pg_dir), %edx
232 movl $PTE_ATTR, %eax 236 movl $PTE_ATTR, %eax
23310: 23710:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 cmpl %ebp,%eax 253 cmpl %ebp,%eax
250 jb 10b 254 jb 10b
251 movl %edi,pa(init_pg_tables_end) 255 movl %edi,pa(init_pg_tables_end)
256 shrl $12, %eax
257 movl %eax, pa(max_pfn_mapped)
252 258
253 /* Do early initialization of the fixmap area */ 259 /* Do early initialization of the fixmap area */
254 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -446,10 +452,13 @@ is386: movl $2,%ecx # set MP
446 je 1f 452 je 1f
447 movl $(__KERNEL_PERCPU), %eax 453 movl $(__KERNEL_PERCPU), %eax
448 movl %eax,%fs # set this cpu's percpu 454 movl %eax,%fs # set this cpu's percpu
449 jmp initialize_secondary # all other CPUs call initialize_secondary 455 movl (stack_start), %esp
4501: 4561:
451#endif /* CONFIG_SMP */ 457#endif /* CONFIG_SMP */
452 jmp i386_start_kernel 458 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
453 462
454/* 463/*
455 * We depend on ET to be correct. This checks for 287/387. 464 * We depend on ET to be correct. This checks for 287/387.
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b817974ef942..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
18#include <asm/page.h> 18#include <asm/page.h>
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h>
21 22
22#ifdef CONFIG_PARAVIRT 23#ifdef CONFIG_PARAVIRT
23#include <asm/asm-offsets.h> 24#include <asm/asm-offsets.h>
@@ -31,6 +32,13 @@
31 * 32 *
32 */ 33 */
33 34
35#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
36
37L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
38L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
39L4_START_KERNEL = pgd_index(__START_KERNEL_map)
40L3_START_KERNEL = pud_index(__START_KERNEL_map)
41
34 .text 42 .text
35 .section .text.head 43 .section .text.head
36 .code64 44 .code64
@@ -76,8 +84,8 @@ startup_64:
76 /* Fixup the physical addresses in the page table 84 /* Fixup the physical addresses in the page table
77 */ 85 */
78 addq %rbp, init_level4_pgt + 0(%rip) 86 addq %rbp, init_level4_pgt + 0(%rip)
79 addq %rbp, init_level4_pgt + (258*8)(%rip) 87 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
80 addq %rbp, init_level4_pgt + (511*8)(%rip) 88 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
81 89
82 addq %rbp, level3_ident_pgt + 0(%rip) 90 addq %rbp, level3_ident_pgt + 0(%rip)
83 91
@@ -154,9 +162,7 @@ ENTRY(secondary_startup_64)
154 */ 162 */
155 163
156 /* Enable PAE mode and PGE */ 164 /* Enable PAE mode and PGE */
157 xorq %rax, %rax 165 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
158 btsq $5, %rax
159 btsq $7, %rax
160 movq %rax, %cr4 166 movq %rax, %cr4
161 167
162 /* Setup early boot stage 4 level pagetables. */ 168 /* Setup early boot stage 4 level pagetables. */
@@ -184,19 +190,15 @@ ENTRY(secondary_startup_64)
1841: wrmsr /* Make changes effective */ 1901: wrmsr /* Make changes effective */
185 191
186 /* Setup cr0 */ 192 /* Setup cr0 */
187#define CR0_PM 1 /* protected mode */ 193#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
188#define CR0_MP (1<<1) 194 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
189#define CR0_ET (1<<4) 195 X86_CR0_PG)
190#define CR0_NE (1<<5) 196 movl $CR0_STATE, %eax
191#define CR0_WP (1<<16)
192#define CR0_AM (1<<18)
193#define CR0_PAGING (1<<31)
194 movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
195 /* Make changes effective */ 197 /* Make changes effective */
196 movq %rax, %cr0 198 movq %rax, %cr0
197 199
198 /* Setup a boot time stack */ 200 /* Setup a boot time stack */
199 movq init_rsp(%rip),%rsp 201 movq stack_start(%rip),%rsp
200 202
201 /* zero EFLAGS after setting rsp */ 203 /* zero EFLAGS after setting rsp */
202 pushq $0 204 pushq $0
@@ -208,7 +210,7 @@ ENTRY(secondary_startup_64)
208 * addresses where we're currently running on. We have to do that here 210 * addresses where we're currently running on. We have to do that here
209 * because in 32bit we couldn't load a 64bit linear address. 211 * because in 32bit we couldn't load a 64bit linear address.
210 */ 212 */
211 lgdt cpu_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
212 214
213 /* set up data segments. actually 0 would do too */ 215 /* set up data segments. actually 0 would do too */
214 movl $__KERNEL_DS,%eax 216 movl $__KERNEL_DS,%eax
@@ -257,8 +259,9 @@ ENTRY(secondary_startup_64)
257 .quad x86_64_start_kernel 259 .quad x86_64_start_kernel
258 __FINITDATA 260 __FINITDATA
259 261
260 ENTRY(init_rsp) 262 ENTRY(stack_start)
261 .quad init_thread_union+THREAD_SIZE-8 263 .quad init_thread_union+THREAD_SIZE-8
264 .word 0
262 265
263bad_address: 266bad_address:
264 jmp bad_address 267 jmp bad_address
@@ -327,11 +330,11 @@ early_idt_ripmsg:
327ENTRY(name) 330ENTRY(name)
328 331
329/* Automate the creation of 1 to 1 mapping pmd entries */ 332/* Automate the creation of 1 to 1 mapping pmd entries */
330#define PMDS(START, PERM, COUNT) \ 333#define PMDS(START, PERM, COUNT) \
331 i = 0 ; \ 334 i = 0 ; \
332 .rept (COUNT) ; \ 335 .rept (COUNT) ; \
333 .quad (START) + (i << 21) + (PERM) ; \ 336 .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
334 i = i + 1 ; \ 337 i = i + 1 ; \
335 .endr 338 .endr
336 339
337 /* 340 /*
@@ -342,9 +345,9 @@ ENTRY(name)
342 */ 345 */
343NEXT_PAGE(init_level4_pgt) 346NEXT_PAGE(init_level4_pgt)
344 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 347 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
345 .fill 257,8,0 348 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
346 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 349 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
347 .fill 252,8,0 350 .org init_level4_pgt + L4_START_KERNEL*8, 0
348 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 351 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
349 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 352 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
350 353
@@ -353,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt)
353 .fill 511,8,0 356 .fill 511,8,0
354 357
355NEXT_PAGE(level3_kernel_pgt) 358NEXT_PAGE(level3_kernel_pgt)
356 .fill 510,8,0 359 .fill L3_START_KERNEL,8,0
357 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 360 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
358 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 361 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
359 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 362 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -384,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt)
384 * If you want to increase this then increase MODULES_VADDR 387 * If you want to increase this then increase MODULES_VADDR
385 * too.) 388 * too.)
386 */ 389 */
387 PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, 390 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
388 KERNEL_IMAGE_SIZE/PMD_SIZE) 391 KERNEL_IMAGE_SIZE/PMD_SIZE)
389 392
390NEXT_PAGE(level2_spare_pgt) 393NEXT_PAGE(level2_spare_pgt)
@@ -395,54 +398,17 @@ NEXT_PAGE(level2_spare_pgt)
395 398
396 .data 399 .data
397 .align 16 400 .align 16
398 .globl cpu_gdt_descr 401 .globl early_gdt_descr
399cpu_gdt_descr: 402early_gdt_descr:
400 .word gdt_end-cpu_gdt_table-1 403 .word GDT_ENTRIES*8-1
401gdt: 404 .quad per_cpu__gdt_page
402 .quad cpu_gdt_table
403#ifdef CONFIG_SMP
404 .rept NR_CPUS-1
405 .word 0
406 .quad 0
407 .endr
408#endif
409 405
410ENTRY(phys_base) 406ENTRY(phys_base)
411 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
412 .quad 0x0000000000000000 408 .quad 0x0000000000000000
413 409
414/* We need valid kernel segments for data and code in long mode too 410#include "../../x86/xen/xen-head.S"
415 * IRET will check the segment types kkeil 2000/10/28
416 * Also sysret mandates a special GDT layout
417 */
418
419 .section .data.page_aligned, "aw"
420 .align PAGE_SIZE
421
422/* The TLS descriptors are currently at a different place compared to i386.
423 Hopefully nobody expects them at a fixed place (Wine?) */
424 411
425ENTRY(cpu_gdt_table)
426 .quad 0x0000000000000000 /* NULL descriptor */
427 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
428 .quad 0x00af9b000000ffff /* __KERNEL_CS */
429 .quad 0x00cf93000000ffff /* __KERNEL_DS */
430 .quad 0x00cffb000000ffff /* __USER32_CS */
431 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
432 .quad 0x00affb000000ffff /* __USER_CS */
433 .quad 0x0 /* unused */
434 .quad 0,0 /* TSS */
435 .quad 0,0 /* LDT */
436 .quad 0,0,0 /* three TLS descriptors */
437 .quad 0x0000f40000000000 /* node/CPU stored in limit */
438gdt_end:
439 /* asm/segment.h:GDT_ENTRIES must match this */
440 /* This should be a multiple of the cache line size */
441 /* GDTs of other CPUs are now dynamically allocated */
442
443 /* zero the remaining page */
444 .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
445
446 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
447 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
448ENTRY(idt_table) 414ENTRY(idt_table)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc426..ad2b15a1334d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
17 17
18/* FSEC = 10^-15 18/* FSEC = 10^-15
19 NSEC = 10^-9 */ 19 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000 20#define FSEC_PER_NSEC 1000000L
21 21
22/* 22/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 23 * HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a)
36} 36}
37 37
38#ifdef CONFIG_X86_64 38#ifdef CONFIG_X86_64
39
40#include <asm/pgtable.h> 39#include <asm/pgtable.h>
41 40#endif
42static inline void hpet_set_mapping(void)
43{
44 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
45 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
46 hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
47}
48
49static inline void hpet_clear_mapping(void)
50{
51 hpet_virt_address = NULL;
52}
53
54#else
55 41
56static inline void hpet_set_mapping(void) 42static inline void hpet_set_mapping(void)
57{ 43{
58 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 44 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
45#ifdef CONFIG_X86_64
46 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
47#endif
59} 48}
60 49
61static inline void hpet_clear_mapping(void) 50static inline void hpet_clear_mapping(void)
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void)
63 iounmap(hpet_virt_address); 52 iounmap(hpet_virt_address);
64 hpet_virt_address = NULL; 53 hpet_virt_address = NULL;
65} 54}
66#endif
67 55
68/* 56/*
69 * HPET command line enable / disable 57 * HPET command line enable / disable
@@ -206,20 +194,19 @@ static void hpet_enable_legacy_int(void)
206 194
207static void hpet_legacy_clockevent_register(void) 195static void hpet_legacy_clockevent_register(void)
208{ 196{
209 uint64_t hpet_freq;
210
211 /* Start HPET legacy interrupts */ 197 /* Start HPET legacy interrupts */
212 hpet_enable_legacy_int(); 198 hpet_enable_legacy_int();
213 199
214 /* 200 /*
215 * The period is a femto seconds value. We need to calculate the 201 * The mult factor is defined as (include/linux/clockchips.h)
216 * scaled math multiplication factor for nanosecond to hpet tick 202 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
217 * conversion. 203 * hpet_period is in units of femtoseconds (per cycle), so
204 * mult/2^shift = cyc/ns = 10^6/hpet_period
205 * mult = (10^6 * 2^shift)/hpet_period
206 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
218 */ 207 */
219 hpet_freq = 1000000000000000ULL; 208 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
220 do_div(hpet_freq, hpet_period); 209 hpet_period, hpet_clockevent.shift);
221 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
222 NSEC_PER_SEC, hpet_clockevent.shift);
223 /* Calculate the min / max delta */ 210 /* Calculate the min / max delta */
224 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
225 &hpet_clockevent); 212 &hpet_clockevent);
@@ -324,7 +311,7 @@ static struct clocksource clocksource_hpet = {
324 311
325static int hpet_clocksource_register(void) 312static int hpet_clocksource_register(void)
326{ 313{
327 u64 tmp, start, now; 314 u64 start, now;
328 cycle_t t1; 315 cycle_t t1;
329 316
330 /* Start the counter */ 317 /* Start the counter */
@@ -351,21 +338,15 @@ static int hpet_clocksource_register(void)
351 return -ENODEV; 338 return -ENODEV;
352 } 339 }
353 340
354 /* Initialize and register HPET clocksource 341 /*
355 * 342 * The definition of mult is (include/linux/clocksource.h)
356 * hpet period is in femto seconds per cycle 343 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
357 * so we need to convert this to ns/cyc units 344 * so we first need to convert hpet_period to ns/cyc units:
358 * approximated by mult/2^shift 345 * mult/2^shift = ns/cyc = hpet_period/10^6
359 * 346 * mult = (hpet_period * 2^shift)/10^6
360 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift 347 * mult = (hpet_period << shift)/FSEC_PER_NSEC
361 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
362 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
363 * (fsec/cyc << shift)/1000000 = mult
364 * (hpet_period << shift)/FSEC_PER_NSEC = mult
365 */ 348 */
366 tmp = (u64)hpet_period << HPET_SHIFT; 349 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
367 do_div(tmp, FSEC_PER_NSEC);
368 clocksource_hpet.mult = (u32)tmp;
369 350
370 clocksource_register(&clocksource_hpet); 351 clocksource_register(&clocksource_hpet);
371 352
@@ -487,7 +468,7 @@ void hpet_disable(void)
487#define RTC_NUM_INTS 1 468#define RTC_NUM_INTS 1
488 469
489static unsigned long hpet_rtc_flags; 470static unsigned long hpet_rtc_flags;
490static unsigned long hpet_prev_update_sec; 471static int hpet_prev_update_sec;
491static struct rtc_time hpet_alarm_time; 472static struct rtc_time hpet_alarm_time;
492static unsigned long hpet_pie_count; 473static unsigned long hpet_pie_count;
493static unsigned long hpet_t1_cmp; 474static unsigned long hpet_t1_cmp;
@@ -594,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
594 575
595 hpet_rtc_flags |= bit_mask; 576 hpet_rtc_flags |= bit_mask;
596 577
578 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
579 hpet_prev_update_sec = -1;
580
597 if (!oldbits) 581 if (!oldbits)
598 hpet_rtc_timer_init(); 582 hpet_rtc_timer_init();
599 583
@@ -671,7 +655,7 @@ static void hpet_rtc_timer_reinit(void)
671 if (hpet_rtc_flags & RTC_PIE) 655 if (hpet_rtc_flags & RTC_PIE)
672 hpet_pie_count += lost_ints; 656 hpet_pie_count += lost_ints;
673 if (printk_ratelimit()) 657 if (printk_ratelimit())
674 printk(KERN_WARNING "rtc: lost %d interrupts\n", 658 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
675 lost_ints); 659 lost_ints);
676 } 660 }
677} 661}
@@ -689,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
689 673
690 if (hpet_rtc_flags & RTC_UIE && 674 if (hpet_rtc_flags & RTC_UIE &&
691 curr_time.tm_sec != hpet_prev_update_sec) { 675 curr_time.tm_sec != hpet_prev_update_sec) {
692 rtc_int_flag = RTC_UF; 676 if (hpet_prev_update_sec >= 0)
677 rtc_int_flag = RTC_UF;
693 hpet_prev_update_sec = curr_time.tm_sec; 678 hpet_prev_update_sec = curr_time.tm_sec;
694 } 679 }
695 680
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e923..dd7ebee446af 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
1#include <linux/module.h> 1#include <linux/module.h>
2
2#include <asm/checksum.h> 3#include <asm/checksum.h>
3#include <asm/desc.h>
4#include <asm/pgtable.h> 4#include <asm/pgtable.h>
5#include <asm/desc.h>
6#include <asm/ftrace.h>
7
8#ifdef CONFIG_FTRACE
9/* mcount is defined in assembly */
10EXPORT_SYMBOL(mcount);
11#endif
5 12
6/* Networking helper routines. */ 13/* Networking helper routines. */
7EXPORT_SYMBOL(csum_partial_copy_generic); 14EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 95e80e5033c3..eb9ddd8efb82 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
162 int ret; 162 int ret;
163 163
164 if (!cpu_has_fxsr) 164 if (!cpu_has_fxsr)
165 return -EIO; 165 return -ENODEV;
166 166
167 ret = init_fpu(target); 167 ret = init_fpu(target);
168 if (ret) 168 if (ret)
@@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
179 int ret; 179 int ret;
180 180
181 if (!cpu_has_fxsr) 181 if (!cpu_has_fxsr)
182 return -EIO; 182 return -ENODEV;
183 183
184 ret = init_fpu(target); 184 ret = init_fpu(target);
185 if (ret) 185 if (ret)
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c
index fe631967d625..dc92b49d9204 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,8 +1,10 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
8#include <linux/init.h> 10#include <linux/init.h>
@@ -10,10 +12,12 @@
10#include <linux/sysdev.h> 12#include <linux/sysdev.h>
11#include <linux/bitops.h> 13#include <linux/bitops.h>
12 14
15#include <asm/acpi.h>
13#include <asm/atomic.h> 16#include <asm/atomic.h>
14#include <asm/system.h> 17#include <asm/system.h>
15#include <asm/io.h> 18#include <asm/io.h>
16#include <asm/timer.h> 19#include <asm/timer.h>
20#include <asm/hw_irq.h>
17#include <asm/pgtable.h> 21#include <asm/pgtable.h>
18#include <asm/delay.h> 22#include <asm/delay.h>
19#include <asm/desc.h> 23#include <asm/desc.h>
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi;
32DEFINE_SPINLOCK(i8259A_lock); 36DEFINE_SPINLOCK(i8259A_lock);
33static void mask_and_ack_8259A(unsigned int); 37static void mask_and_ack_8259A(unsigned int);
34 38
35static struct irq_chip i8259A_chip = { 39struct irq_chip i8259A_chip = {
36 .name = "XT-PIC", 40 .name = "XT-PIC",
37 .mask = disable_8259A_irq, 41 .mask = disable_8259A_irq,
38 .disable = disable_8259A_irq, 42 .disable = disable_8259A_irq,
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
125 int irqmask = 1<<irq; 129 int irqmask = 1<<irq;
126 130
127 if (irq < 8) { 131 if (irq < 8) {
128 outb(0x0B,PIC_MASTER_CMD); /* ISR register */ 132 outb(0x0B, PIC_MASTER_CMD); /* ISR register */
129 value = inb(PIC_MASTER_CMD) & irqmask; 133 value = inb(PIC_MASTER_CMD) & irqmask;
130 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ 134 outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */
131 return value; 135 return value;
132 } 136 }
133 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ 137 outb(0x0B, PIC_SLAVE_CMD); /* ISR register */
134 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); 138 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
135 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ 139 outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */
136 return value; 140 return value;
137} 141}
138 142
@@ -171,12 +175,14 @@ handle_real_irq:
171 if (irq & 8) { 175 if (irq & 8) {
172 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ 176 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
173 outb(cached_slave_mask, PIC_SLAVE_IMR); 177 outb(cached_slave_mask, PIC_SLAVE_IMR);
174 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ 178 /* 'Specific EOI' to slave */
175 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ 179 outb(0x60+(irq&7), PIC_SLAVE_CMD);
180 /* 'Specific EOI' to master-IRQ2 */
181 outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
176 } else { 182 } else {
177 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ 183 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
178 outb(cached_master_mask, PIC_MASTER_IMR); 184 outb(cached_master_mask, PIC_MASTER_IMR);
179 outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ 185 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
180 } 186 }
181 spin_unlock_irqrestore(&i8259A_lock, flags); 187 spin_unlock_irqrestore(&i8259A_lock, flags);
182 return; 188 return;
@@ -199,7 +205,8 @@ spurious_8259A_irq:
199 * lets ACK and report it. [once per IRQ] 205 * lets ACK and report it. [once per IRQ]
200 */ 206 */
201 if (!(spurious_irq_mask & irqmask)) { 207 if (!(spurious_irq_mask & irqmask)) {
202 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); 208 printk(KERN_DEBUG
209 "spurious 8259A interrupt: IRQ%d.\n", irq);
203 spurious_irq_mask |= irqmask; 210 spurious_irq_mask |= irqmask;
204 } 211 }
205 atomic_inc(&irq_err_count); 212 atomic_inc(&irq_err_count);
@@ -290,17 +297,28 @@ void init_8259A(int auto_eoi)
290 * outb_pic - this has to work on a wide range of PC hardware. 297 * outb_pic - this has to work on a wide range of PC hardware.
291 */ 298 */
292 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 299 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
293 outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ 300
294 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ 301 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
302 to 0x20-0x27 on i386 */
303 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
304
305 /* 8259A-1 (the master) has a slave on IR2 */
306 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
307
295 if (auto_eoi) /* master does Auto EOI */ 308 if (auto_eoi) /* master does Auto EOI */
296 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); 309 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
297 else /* master expects normal EOI */ 310 else /* master expects normal EOI */
298 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); 311 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
299 312
300 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ 313 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
301 outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ 314
302 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ 315 /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
303 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ 316 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
317 /* 8259A-2 is a slave on master's IR2 */
318 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
319 /* (slave's support for AEOI in flat mode is to be investigated) */
320 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
321
304 if (auto_eoi) 322 if (auto_eoi)
305 /* 323 /*
306 * In AEOI mode we just have to mask the interrupt 324 * In AEOI mode we just have to mask the interrupt
@@ -317,93 +335,3 @@ void init_8259A(int auto_eoi)
317 335
318 spin_unlock_irqrestore(&i8259A_lock, flags); 336 spin_unlock_irqrestore(&i8259A_lock, flags);
319} 337}
320
321/*
322 * Note that on a 486, we don't want to do a SIGFPE on an irq13
323 * as the irq is unreliable, and exception 16 works correctly
324 * (ie as explained in the intel literature). On a 386, you
325 * can't use exception 16 due to bad IBM design, so we have to
326 * rely on the less exact irq13.
327 *
328 * Careful.. Not only is IRQ13 unreliable, but it is also
329 * leads to races. IBM designers who came up with it should
330 * be shot.
331 */
332
333
334static irqreturn_t math_error_irq(int cpl, void *dev_id)
335{
336 extern void math_error(void __user *);
337 outb(0,0xF0);
338 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
339 return IRQ_NONE;
340 math_error((void __user *)get_irq_regs()->ip);
341 return IRQ_HANDLED;
342}
343
344/*
345 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
346 * so allow interrupt sharing.
347 */
348static struct irqaction fpu_irq = {
349 .handler = math_error_irq,
350 .mask = CPU_MASK_NONE,
351 .name = "fpu",
352};
353
354void __init init_ISA_irqs (void)
355{
356 int i;
357
358#ifdef CONFIG_X86_LOCAL_APIC
359 init_bsp_APIC();
360#endif
361 init_8259A(0);
362
363 /*
364 * 16 old-style INTA-cycle interrupts:
365 */
366 for (i = 0; i < 16; i++) {
367 set_irq_chip_and_handler_name(i, &i8259A_chip,
368 handle_level_irq, "XT");
369 }
370}
371
372/* Overridden in paravirt.c */
373void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
374
375void __init native_init_IRQ(void)
376{
377 int i;
378
379 /* all the set up before the call gates are initialised */
380 pre_intr_init_hook();
381
382 /*
383 * Cover the whole vector space, no vector can escape
384 * us. (some of these will be overridden and become
385 * 'special' SMP interrupts)
386 */
387 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
388 int vector = FIRST_EXTERNAL_VECTOR + i;
389 if (i >= NR_IRQS)
390 break;
391 /* SYSCALL_VECTOR was reserved in trap_init. */
392 if (!test_bit(vector, used_vectors))
393 set_intr_gate(vector, interrupt[i]);
394 }
395
396 /* setup after call gates are initialised (usually add in
397 * the architecture specific gates)
398 */
399 intr_init_hook();
400
401 /*
402 * External FPU? Set up irq13 if so, for
403 * original braindamaged IBM FERR coupling.
404 */
405 if (boot_cpu_data.hard_math && !cpu_has_fpu)
406 setup_irq(FPU_IRQ, &fpu_irq);
407
408 irq_ctx_init(smp_processor_id());
409}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index fa57a1568508..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,512 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define BI(x,y) \
38 BUILD_IRQ(x##y)
39
40#define BUILD_16_IRQS(x) \
41 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
42 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
43 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
44 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
45
46/*
47 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
48 * (these are usually mapped to vectors 0x30-0x3f)
49 */
50
51/*
52 * The IO-APIC gives us many more interrupt sources. Most of these
53 * are unused but an SMP system is supposed to have enough memory ...
54 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
55 * across the spectrum, so we really want to be prepared to get all
56 * of these. Plus, more powerful systems might have more than 64
57 * IO-APIC registers.
58 *
59 * (these are usually mapped into the 0x30-0xff vector range)
60 */
61 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
62BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
63BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
64BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
65
66#undef BUILD_16_IRQS
67#undef BI
68
69
70#define IRQ(x,y) \
71 IRQ##x##y##_interrupt
72
73#define IRQLIST_16(x) \
74 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
75 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
76 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
78
79/* for the irq vectors */
80static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
81 IRQLIST_16(0x2), IRQLIST_16(0x3),
82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
84 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
85};
86
87#undef IRQ
88#undef IRQLIST_16
89
90/*
91 * This is the 'legacy' 8259A Programmable Interrupt Controller,
92 * present in the majority of PC/AT boxes.
93 * plus some generic x86 specific things if generic specifics makes
94 * any sense at all.
95 * this file should become arch/i386/kernel/irq.c when the old irq.c
96 * moves to arch independent land
97 */
98
99static int i8259A_auto_eoi;
100DEFINE_SPINLOCK(i8259A_lock);
101static void mask_and_ack_8259A(unsigned int);
102
103static struct irq_chip i8259A_chip = {
104 .name = "XT-PIC",
105 .mask = disable_8259A_irq,
106 .disable = disable_8259A_irq,
107 .unmask = enable_8259A_irq,
108 .mask_ack = mask_and_ack_8259A,
109};
110
111/*
112 * 8259A PIC functions to handle ISA devices:
113 */
114
115/*
116 * This contains the irq mask for both 8259A irq controllers,
117 */
118unsigned int cached_irq_mask = 0xffff;
119
120/*
121 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
122 * boards the timer interrupt is not really connected to any IO-APIC pin,
123 * it's fed to the master 8259A's IR0 line only.
124 *
125 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
126 * this 'mixed mode' IRQ handling costs nothing because it's only used
127 * at IRQ setup time.
128 */
129unsigned long io_apic_irqs;
130
131void disable_8259A_irq(unsigned int irq)
132{
133 unsigned int mask = 1 << irq;
134 unsigned long flags;
135
136 spin_lock_irqsave(&i8259A_lock, flags);
137 cached_irq_mask |= mask;
138 if (irq & 8)
139 outb(cached_slave_mask, PIC_SLAVE_IMR);
140 else
141 outb(cached_master_mask, PIC_MASTER_IMR);
142 spin_unlock_irqrestore(&i8259A_lock, flags);
143}
144
145void enable_8259A_irq(unsigned int irq)
146{
147 unsigned int mask = ~(1 << irq);
148 unsigned long flags;
149
150 spin_lock_irqsave(&i8259A_lock, flags);
151 cached_irq_mask &= mask;
152 if (irq & 8)
153 outb(cached_slave_mask, PIC_SLAVE_IMR);
154 else
155 outb(cached_master_mask, PIC_MASTER_IMR);
156 spin_unlock_irqrestore(&i8259A_lock, flags);
157}
158
159int i8259A_irq_pending(unsigned int irq)
160{
161 unsigned int mask = 1<<irq;
162 unsigned long flags;
163 int ret;
164
165 spin_lock_irqsave(&i8259A_lock, flags);
166 if (irq < 8)
167 ret = inb(PIC_MASTER_CMD) & mask;
168 else
169 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
170 spin_unlock_irqrestore(&i8259A_lock, flags);
171
172 return ret;
173}
174
175void make_8259A_irq(unsigned int irq)
176{
177 disable_irq_nosync(irq);
178 io_apic_irqs &= ~(1<<irq);
179 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
180 "XT");
181 enable_irq(irq);
182}
183
184/*
185 * This function assumes to be called rarely. Switching between
186 * 8259A registers is slow.
187 * This has to be protected by the irq controller spinlock
188 * before being called.
189 */
190static inline int i8259A_irq_real(unsigned int irq)
191{
192 int value;
193 int irqmask = 1<<irq;
194
195 if (irq < 8) {
196 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
197 value = inb(PIC_MASTER_CMD) & irqmask;
198 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
199 return value;
200 }
201 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
202 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
203 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
204 return value;
205}
206
207/*
208 * Careful! The 8259A is a fragile beast, it pretty
209 * much _has_ to be done exactly like this (mask it
210 * first, _then_ send the EOI, and the order of EOI
211 * to the two 8259s is important!
212 */
213static void mask_and_ack_8259A(unsigned int irq)
214{
215 unsigned int irqmask = 1 << irq;
216 unsigned long flags;
217
218 spin_lock_irqsave(&i8259A_lock, flags);
219 /*
220 * Lightweight spurious IRQ detection. We do not want
221 * to overdo spurious IRQ handling - it's usually a sign
222 * of hardware problems, so we only do the checks we can
223 * do without slowing down good hardware unnecessarily.
224 *
225 * Note that IRQ7 and IRQ15 (the two spurious IRQs
226 * usually resulting from the 8259A-1|2 PICs) occur
227 * even if the IRQ is masked in the 8259A. Thus we
228 * can check spurious 8259A IRQs without doing the
229 * quite slow i8259A_irq_real() call for every IRQ.
230 * This does not cover 100% of spurious interrupts,
231 * but should be enough to warn the user that there
232 * is something bad going on ...
233 */
234 if (cached_irq_mask & irqmask)
235 goto spurious_8259A_irq;
236 cached_irq_mask |= irqmask;
237
238handle_real_irq:
239 if (irq & 8) {
240 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
241 outb(cached_slave_mask, PIC_SLAVE_IMR);
242 /* 'Specific EOI' to slave */
243 outb(0x60+(irq&7),PIC_SLAVE_CMD);
244 /* 'Specific EOI' to master-IRQ2 */
245 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
246 } else {
247 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
248 outb(cached_master_mask, PIC_MASTER_IMR);
249 /* 'Specific EOI' to master */
250 outb(0x60+irq,PIC_MASTER_CMD);
251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return;
254
255spurious_8259A_irq:
256 /*
257 * this is the slow path - should happen rarely.
258 */
259 if (i8259A_irq_real(irq))
260 /*
261 * oops, the IRQ _is_ in service according to the
262 * 8259A - not spurious, go handle it.
263 */
264 goto handle_real_irq;
265
266 {
267 static int spurious_irq_mask;
268 /*
269 * At this point we can be sure the IRQ is spurious,
270 * lets ACK and report it. [once per IRQ]
271 */
272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG
274 "spurious 8259A interrupt: IRQ%d.\n", irq);
275 spurious_irq_mask |= irqmask;
276 }
277 atomic_inc(&irq_err_count);
278 /*
279 * Theoretically we do not have to handle this IRQ,
280 * but in Linux this does not cause problems and is
281 * simpler for us.
282 */
283 goto handle_real_irq;
284 }
285}
286
287static char irq_trigger[2];
288/**
289 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
290 */
291static void restore_ELCR(char *trigger)
292{
293 outb(trigger[0], 0x4d0);
294 outb(trigger[1], 0x4d1);
295}
296
297static void save_ELCR(char *trigger)
298{
299 /* IRQ 0,1,2,8,13 are marked as reserved */
300 trigger[0] = inb(0x4d0) & 0xF8;
301 trigger[1] = inb(0x4d1) & 0xDE;
302}
303
304static int i8259A_resume(struct sys_device *dev)
305{
306 init_8259A(i8259A_auto_eoi);
307 restore_ELCR(irq_trigger);
308 return 0;
309}
310
311static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
312{
313 save_ELCR(irq_trigger);
314 return 0;
315}
316
317static int i8259A_shutdown(struct sys_device *dev)
318{
319 /* Put the i8259A into a quiescent state that
320 * the kernel initialization code can get it
321 * out of.
322 */
323 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
324 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
325 return 0;
326}
327
328static struct sysdev_class i8259_sysdev_class = {
329 .name = "i8259",
330 .suspend = i8259A_suspend,
331 .resume = i8259A_resume,
332 .shutdown = i8259A_shutdown,
333};
334
335static struct sys_device device_i8259A = {
336 .id = 0,
337 .cls = &i8259_sysdev_class,
338};
339
340static int __init i8259A_init_sysfs(void)
341{
342 int error = sysdev_class_register(&i8259_sysdev_class);
343 if (!error)
344 error = sysdev_register(&device_i8259A);
345 return error;
346}
347
348device_initcall(i8259A_init_sysfs);
349
350void init_8259A(int auto_eoi)
351{
352 unsigned long flags;
353
354 i8259A_auto_eoi = auto_eoi;
355
356 spin_lock_irqsave(&i8259A_lock, flags);
357
358 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
359 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
360
361 /*
362 * outb_pic - this has to work on a wide range of PC hardware.
363 */
364 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
365 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
366 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
367 /* 8259A-1 (the master) has a slave on IR2 */
368 outb_pic(0x04, PIC_MASTER_IMR);
369 if (auto_eoi) /* master does Auto EOI */
370 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
371 else /* master expects normal EOI */
372 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
373
374 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
375 /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
376 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
377 /* 8259A-2 is a slave on master's IR2 */
378 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
379 /* (slave's support for AEOI in flat mode is to be investigated) */
380 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
381
382 if (auto_eoi)
383 /*
384 * In AEOI mode we just have to mask the interrupt
385 * when acking.
386 */
387 i8259A_chip.mask_ack = disable_8259A_irq;
388 else
389 i8259A_chip.mask_ack = mask_and_ack_8259A;
390
391 udelay(100); /* wait for 8259A to initialize */
392
393 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
394 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
395
396 spin_unlock_irqrestore(&i8259A_lock, flags);
397}
398
399
400
401
402/*
403 * IRQ2 is cascade interrupt to second interrupt controller
404 */
405
406static struct irqaction irq2 = {
407 .handler = no_action,
408 .mask = CPU_MASK_NONE,
409 .name = "cascade",
410};
411DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
412 [0 ... IRQ0_VECTOR - 1] = -1,
413 [IRQ0_VECTOR] = 0,
414 [IRQ1_VECTOR] = 1,
415 [IRQ2_VECTOR] = 2,
416 [IRQ3_VECTOR] = 3,
417 [IRQ4_VECTOR] = 4,
418 [IRQ5_VECTOR] = 5,
419 [IRQ6_VECTOR] = 6,
420 [IRQ7_VECTOR] = 7,
421 [IRQ8_VECTOR] = 8,
422 [IRQ9_VECTOR] = 9,
423 [IRQ10_VECTOR] = 10,
424 [IRQ11_VECTOR] = 11,
425 [IRQ12_VECTOR] = 12,
426 [IRQ13_VECTOR] = 13,
427 [IRQ14_VECTOR] = 14,
428 [IRQ15_VECTOR] = 15,
429 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
430};
431
432void __init init_ISA_irqs (void)
433{
434 int i;
435
436 init_bsp_APIC();
437 init_8259A(0);
438
439 for (i = 0; i < NR_IRQS; i++) {
440 irq_desc[i].status = IRQ_DISABLED;
441 irq_desc[i].action = NULL;
442 irq_desc[i].depth = 1;
443
444 if (i < 16) {
445 /*
446 * 16 old-style INTA-cycle interrupts:
447 */
448 set_irq_chip_and_handler_name(i, &i8259A_chip,
449 handle_level_irq, "XT");
450 } else {
451 /*
452 * 'high' PCI IRQs filled in on demand
453 */
454 irq_desc[i].chip = &no_irq_chip;
455 }
456 }
457}
458
459void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
460
461void __init native_init_IRQ(void)
462{
463 int i;
464
465 init_ISA_irqs();
466 /*
467 * Cover the whole vector space, no vector can escape
468 * us. (some of these will be overridden and become
469 * 'special' SMP interrupts)
470 */
471 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
472 int vector = FIRST_EXTERNAL_VECTOR + i;
473 if (vector != IA32_SYSCALL_VECTOR)
474 set_intr_gate(vector, interrupt[i]);
475 }
476
477#ifdef CONFIG_SMP
478 /*
479 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
480 * IPI, driven by wakeup.
481 */
482 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
483
484 /* IPIs for invalidation */
485 set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
486 set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
487 set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
488 set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
489 set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
490 set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
491 set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
492 set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
493
494 /* IPI for generic function call */
495 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
496
497 /* Low priority IPI to cleanup after moving an irq */
498 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
499#endif
500 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
501 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
502
503 /* self generated IPI for local APIC timer */
504 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
505
506 /* IPI vectors for APIC spurious and error interrupts */
507 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
508 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
509
510 if (!acpi_ioapic)
511 setup_irq(2, &irq2);
512}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..de9aa0e3a9c5 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -25,6 +25,7 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/bootmem.h>
28#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
29#include <linux/compiler.h> 30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
@@ -58,7 +59,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58static DEFINE_SPINLOCK(ioapic_lock); 59static DEFINE_SPINLOCK(ioapic_lock);
59static DEFINE_SPINLOCK(vector_lock); 60static DEFINE_SPINLOCK(vector_lock);
60 61
61int timer_over_8254 __initdata = 1; 62int timer_through_8259 __initdata;
62 63
63/* 64/*
64 * Is the SiS APIC rmw bug present ? 65 * Is the SiS APIC rmw bug present ?
@@ -72,15 +73,21 @@ int sis_apic_bug = -1;
72int nr_ioapic_registers[MAX_IO_APICS]; 73int nr_ioapic_registers[MAX_IO_APICS];
73 74
74/* I/O APIC entries */ 75/* I/O APIC entries */
75struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 76struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
76int nr_ioapics; 77int nr_ioapics;
77 78
78/* MP IRQ source entries */ 79/* MP IRQ source entries */
79struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 80struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
80 81
81/* # of MP IRQ source entries */ 82/* # of MP IRQ source entries */
82int mp_irq_entries; 83int mp_irq_entries;
83 84
85#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
86int mp_bus_id_to_type[MAX_MP_BUSSES];
87#endif
88
89DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
90
84static int disable_timer_pin_1 __initdata; 91static int disable_timer_pin_1 __initdata;
85 92
86/* 93/*
@@ -110,7 +117,7 @@ struct io_apic {
110static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 117static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
111{ 118{
112 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 119 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
113 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 120 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
114} 121}
115 122
116static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 123static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -239,7 +246,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
239 } 246 }
240} 247}
241 248
242static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) 249static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
243{ 250{
244 struct irq_pin_list *entry = irq_2_pin + irq; 251 struct irq_pin_list *entry = irq_2_pin + irq;
245 unsigned int pin, reg; 252 unsigned int pin, reg;
@@ -259,30 +266,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
259} 266}
260 267
261/* mask = 1 */ 268/* mask = 1 */
262static void __mask_IO_APIC_irq (unsigned int irq) 269static void __mask_IO_APIC_irq(unsigned int irq)
263{ 270{
264 __modify_IO_APIC_irq(irq, 0x00010000, 0); 271 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
265} 272}
266 273
267/* mask = 0 */ 274/* mask = 0 */
268static void __unmask_IO_APIC_irq (unsigned int irq) 275static void __unmask_IO_APIC_irq(unsigned int irq)
269{ 276{
270 __modify_IO_APIC_irq(irq, 0, 0x00010000); 277 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
271} 278}
272 279
273/* mask = 1, trigger = 0 */ 280/* mask = 1, trigger = 0 */
274static void __mask_and_edge_IO_APIC_irq (unsigned int irq) 281static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
275{ 282{
276 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); 283 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
284 IO_APIC_REDIR_LEVEL_TRIGGER);
277} 285}
278 286
279/* mask = 0, trigger = 1 */ 287/* mask = 0, trigger = 1 */
280static void __unmask_and_level_IO_APIC_irq (unsigned int irq) 288static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
281{ 289{
282 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); 290 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
291 IO_APIC_REDIR_MASKED);
283} 292}
284 293
285static void mask_IO_APIC_irq (unsigned int irq) 294static void mask_IO_APIC_irq(unsigned int irq)
286{ 295{
287 unsigned long flags; 296 unsigned long flags;
288 297
@@ -291,7 +300,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
291 spin_unlock_irqrestore(&ioapic_lock, flags); 300 spin_unlock_irqrestore(&ioapic_lock, flags);
292} 301}
293 302
294static void unmask_IO_APIC_irq (unsigned int irq) 303static void unmask_IO_APIC_irq(unsigned int irq)
295{ 304{
296 unsigned long flags; 305 unsigned long flags;
297 306
@@ -303,7 +312,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
303static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 312static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
304{ 313{
305 struct IO_APIC_route_entry entry; 314 struct IO_APIC_route_entry entry;
306 315
307 /* Check delivery_mode to be sure we're not clearing an SMI pin */ 316 /* Check delivery_mode to be sure we're not clearing an SMI pin */
308 entry = ioapic_read_entry(apic, pin); 317 entry = ioapic_read_entry(apic, pin);
309 if (entry.delivery_mode == dest_SMI) 318 if (entry.delivery_mode == dest_SMI)
@@ -315,7 +324,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
315 ioapic_mask_entry(apic, pin); 324 ioapic_mask_entry(apic, pin);
316} 325}
317 326
318static void clear_IO_APIC (void) 327static void clear_IO_APIC(void)
319{ 328{
320 int apic, pin; 329 int apic, pin;
321 330
@@ -332,7 +341,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
332 struct irq_pin_list *entry = irq_2_pin + irq; 341 struct irq_pin_list *entry = irq_2_pin + irq;
333 unsigned int apicid_value; 342 unsigned int apicid_value;
334 cpumask_t tmp; 343 cpumask_t tmp;
335 344
336 cpus_and(tmp, cpumask, cpu_online_map); 345 cpus_and(tmp, cpumask, cpu_online_map);
337 if (cpus_empty(tmp)) 346 if (cpus_empty(tmp))
338 tmp = TARGET_CPUS; 347 tmp = TARGET_CPUS;
@@ -361,7 +370,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
361# include <linux/kernel_stat.h> /* kstat */ 370# include <linux/kernel_stat.h> /* kstat */
362# include <linux/slab.h> /* kmalloc() */ 371# include <linux/slab.h> /* kmalloc() */
363# include <linux/timer.h> 372# include <linux/timer.h>
364 373
365#define IRQBALANCE_CHECK_ARCH -999 374#define IRQBALANCE_CHECK_ARCH -999
366#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) 375#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
367#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) 376#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -373,14 +382,14 @@ static int physical_balance __read_mostly;
373static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; 382static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
374 383
375static struct irq_cpu_info { 384static struct irq_cpu_info {
376 unsigned long * last_irq; 385 unsigned long *last_irq;
377 unsigned long * irq_delta; 386 unsigned long *irq_delta;
378 unsigned long irq; 387 unsigned long irq;
379} irq_cpu_data[NR_CPUS]; 388} irq_cpu_data[NR_CPUS];
380 389
381#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) 390#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
382#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) 391#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
383#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) 392#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
384 393
385#define IDLE_ENOUGH(cpu,now) \ 394#define IDLE_ENOUGH(cpu,now) \
386 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) 395 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -419,8 +428,8 @@ inside:
419 if (cpu == -1) 428 if (cpu == -1)
420 cpu = NR_CPUS-1; 429 cpu = NR_CPUS-1;
421 } 430 }
422 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || 431 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
423 (search_idle && !IDLE_ENOUGH(cpu,now))); 432 (search_idle && !IDLE_ENOUGH(cpu, now)));
424 433
425 return cpu; 434 return cpu;
426} 435}
@@ -430,15 +439,14 @@ static inline void balance_irq(int cpu, int irq)
430 unsigned long now = jiffies; 439 unsigned long now = jiffies;
431 cpumask_t allowed_mask; 440 cpumask_t allowed_mask;
432 unsigned int new_cpu; 441 unsigned int new_cpu;
433 442
434 if (irqbalance_disabled) 443 if (irqbalance_disabled)
435 return; 444 return;
436 445
437 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); 446 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
438 new_cpu = move(cpu, allowed_mask, now, 1); 447 new_cpu = move(cpu, allowed_mask, now, 1);
439 if (cpu != new_cpu) { 448 if (cpu != new_cpu)
440 set_pending_irq(irq, cpumask_of_cpu(new_cpu)); 449 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
441 }
442} 450}
443 451
444static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) 452static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -450,14 +458,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
450 if (!irq_desc[j].action) 458 if (!irq_desc[j].action)
451 continue; 459 continue;
452 /* Is it a significant load ? */ 460 /* Is it a significant load ? */
453 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < 461 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
454 useful_load_threshold) 462 useful_load_threshold)
455 continue; 463 continue;
456 balance_irq(i, j); 464 balance_irq(i, j);
457 } 465 }
458 } 466 }
459 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, 467 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
460 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); 468 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
461 return; 469 return;
462} 470}
463 471
@@ -486,22 +494,22 @@ static void do_irq_balance(void)
486 /* Is this an active IRQ or balancing disabled ? */ 494 /* Is this an active IRQ or balancing disabled ? */
487 if (!irq_desc[j].action || irq_balancing_disabled(j)) 495 if (!irq_desc[j].action || irq_balancing_disabled(j))
488 continue; 496 continue;
489 if ( package_index == i ) 497 if (package_index == i)
490 IRQ_DELTA(package_index,j) = 0; 498 IRQ_DELTA(package_index, j) = 0;
491 /* Determine the total count per processor per IRQ */ 499 /* Determine the total count per processor per IRQ */
492 value_now = (unsigned long) kstat_cpu(i).irqs[j]; 500 value_now = (unsigned long) kstat_cpu(i).irqs[j];
493 501
494 /* Determine the activity per processor per IRQ */ 502 /* Determine the activity per processor per IRQ */
495 delta = value_now - LAST_CPU_IRQ(i,j); 503 delta = value_now - LAST_CPU_IRQ(i, j);
496 504
497 /* Update last_cpu_irq[][] for the next time */ 505 /* Update last_cpu_irq[][] for the next time */
498 LAST_CPU_IRQ(i,j) = value_now; 506 LAST_CPU_IRQ(i, j) = value_now;
499 507
500 /* Ignore IRQs whose rate is less than the clock */ 508 /* Ignore IRQs whose rate is less than the clock */
501 if (delta < useful_load_threshold) 509 if (delta < useful_load_threshold)
502 continue; 510 continue;
503 /* update the load for the processor or package total */ 511 /* update the load for the processor or package total */
504 IRQ_DELTA(package_index,j) += delta; 512 IRQ_DELTA(package_index, j) += delta;
505 513
506 /* Keep track of the higher numbered sibling as well */ 514 /* Keep track of the higher numbered sibling as well */
507 if (i != package_index) 515 if (i != package_index)
@@ -527,7 +535,8 @@ static void do_irq_balance(void)
527 max_cpu_irq = ULONG_MAX; 535 max_cpu_irq = ULONG_MAX;
528 536
529tryanothercpu: 537tryanothercpu:
530 /* Look for heaviest loaded processor. 538 /*
539 * Look for heaviest loaded processor.
531 * We may come back to get the next heaviest loaded processor. 540 * We may come back to get the next heaviest loaded processor.
532 * Skip processors with trivial loads. 541 * Skip processors with trivial loads.
533 */ 542 */
@@ -536,7 +545,7 @@ tryanothercpu:
536 for_each_online_cpu(i) { 545 for_each_online_cpu(i) {
537 if (i != CPU_TO_PACKAGEINDEX(i)) 546 if (i != CPU_TO_PACKAGEINDEX(i))
538 continue; 547 continue;
539 if (max_cpu_irq <= CPU_IRQ(i)) 548 if (max_cpu_irq <= CPU_IRQ(i))
540 continue; 549 continue;
541 if (tmp_cpu_irq < CPU_IRQ(i)) { 550 if (tmp_cpu_irq < CPU_IRQ(i)) {
542 tmp_cpu_irq = CPU_IRQ(i); 551 tmp_cpu_irq = CPU_IRQ(i);
@@ -545,8 +554,9 @@ tryanothercpu:
545 } 554 }
546 555
547 if (tmp_loaded == -1) { 556 if (tmp_loaded == -1) {
548 /* In the case of small number of heavy interrupt sources, 557 /*
549 * loading some of the cpus too much. We use Ingo's original 558 * In the case of small number of heavy interrupt sources,
559 * loading some of the cpus too much. We use Ingo's original
550 * approach to rotate them around. 560 * approach to rotate them around.
551 */ 561 */
552 if (!first_attempt && imbalance >= useful_load_threshold) { 562 if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -555,13 +565,14 @@ tryanothercpu:
555 } 565 }
556 goto not_worth_the_effort; 566 goto not_worth_the_effort;
557 } 567 }
558 568
559 first_attempt = 0; /* heaviest search */ 569 first_attempt = 0; /* heaviest search */
560 max_cpu_irq = tmp_cpu_irq; /* load */ 570 max_cpu_irq = tmp_cpu_irq; /* load */
561 max_loaded = tmp_loaded; /* processor */ 571 max_loaded = tmp_loaded; /* processor */
562 imbalance = (max_cpu_irq - min_cpu_irq) / 2; 572 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
563 573
564 /* if imbalance is less than approx 10% of max load, then 574 /*
575 * if imbalance is less than approx 10% of max load, then
565 * observe diminishing returns action. - quit 576 * observe diminishing returns action. - quit
566 */ 577 */
567 if (imbalance < (max_cpu_irq >> 3)) 578 if (imbalance < (max_cpu_irq >> 3))
@@ -577,26 +588,25 @@ tryanotherirq:
577 /* Is this an active IRQ? */ 588 /* Is this an active IRQ? */
578 if (!irq_desc[j].action) 589 if (!irq_desc[j].action)
579 continue; 590 continue;
580 if (imbalance <= IRQ_DELTA(max_loaded,j)) 591 if (imbalance <= IRQ_DELTA(max_loaded, j))
581 continue; 592 continue;
582 /* Try to find the IRQ that is closest to the imbalance 593 /* Try to find the IRQ that is closest to the imbalance
583 * without going over. 594 * without going over.
584 */ 595 */
585 if (move_this_load < IRQ_DELTA(max_loaded,j)) { 596 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
586 move_this_load = IRQ_DELTA(max_loaded,j); 597 move_this_load = IRQ_DELTA(max_loaded, j);
587 selected_irq = j; 598 selected_irq = j;
588 } 599 }
589 } 600 }
590 if (selected_irq == -1) { 601 if (selected_irq == -1)
591 goto tryanothercpu; 602 goto tryanothercpu;
592 }
593 603
594 imbalance = move_this_load; 604 imbalance = move_this_load;
595 605
596 /* For physical_balance case, we accumulated both load 606 /* For physical_balance case, we accumulated both load
597 * values in the one of the siblings cpu_irq[], 607 * values in the one of the siblings cpu_irq[],
598 * to use the same code for physical and logical processors 608 * to use the same code for physical and logical processors
599 * as much as possible. 609 * as much as possible.
600 * 610 *
601 * NOTE: the cpu_irq[] array holds the sum of the load for 611 * NOTE: the cpu_irq[] array holds the sum of the load for
602 * sibling A and sibling B in the slot for the lowest numbered 612 * sibling A and sibling B in the slot for the lowest numbered
@@ -625,11 +635,11 @@ tryanotherirq:
625 /* mark for change destination */ 635 /* mark for change destination */
626 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); 636 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
627 637
628 /* Since we made a change, come back sooner to 638 /* Since we made a change, come back sooner to
629 * check for more variation. 639 * check for more variation.
630 */ 640 */
631 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, 641 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
632 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); 642 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
633 return; 643 return;
634 } 644 }
635 goto tryanotherirq; 645 goto tryanotherirq;
@@ -640,7 +650,7 @@ not_worth_the_effort:
640 * upward 650 * upward
641 */ 651 */
642 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, 652 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
643 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); 653 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
644 return; 654 return;
645} 655}
646 656
@@ -679,13 +689,13 @@ static int __init balanced_irq_init(void)
679 cpumask_t tmp; 689 cpumask_t tmp;
680 690
681 cpus_shift_right(tmp, cpu_online_map, 2); 691 cpus_shift_right(tmp, cpu_online_map, 2);
682 c = &boot_cpu_data; 692 c = &boot_cpu_data;
683 /* When not overwritten by the command line ask subarchitecture. */ 693 /* When not overwritten by the command line ask subarchitecture. */
684 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) 694 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
685 irqbalance_disabled = NO_BALANCE_IRQ; 695 irqbalance_disabled = NO_BALANCE_IRQ;
686 if (irqbalance_disabled) 696 if (irqbalance_disabled)
687 return 0; 697 return 0;
688 698
689 /* disable irqbalance completely if there is only one processor online */ 699 /* disable irqbalance completely if there is only one processor online */
690 if (num_online_cpus() < 2) { 700 if (num_online_cpus() < 2) {
691 irqbalance_disabled = 1; 701 irqbalance_disabled = 1;
@@ -699,16 +709,14 @@ static int __init balanced_irq_init(void)
699 physical_balance = 1; 709 physical_balance = 1;
700 710
701 for_each_online_cpu(i) { 711 for_each_online_cpu(i) {
702 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); 712 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
703 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); 713 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
704 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { 714 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
705 printk(KERN_ERR "balanced_irq_init: out of memory"); 715 printk(KERN_ERR "balanced_irq_init: out of memory");
706 goto failed; 716 goto failed;
707 } 717 }
708 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
709 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
710 } 718 }
711 719
712 printk(KERN_INFO "Starting balanced_irq\n"); 720 printk(KERN_INFO "Starting balanced_irq\n");
713 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) 721 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
714 return 0; 722 return 0;
@@ -748,7 +756,7 @@ void send_IPI_self(int vector)
748 /* 756 /*
749 * Send the IPI. The write to APIC_ICR fires this off. 757 * Send the IPI. The write to APIC_ICR fires this off.
750 */ 758 */
751 apic_write_around(APIC_ICR, cfg); 759 apic_write(APIC_ICR, cfg);
752} 760}
753#endif /* !CONFIG_SMP */ 761#endif /* !CONFIG_SMP */
754 762
@@ -801,10 +809,10 @@ static int find_irq_entry(int apic, int pin, int type)
801 int i; 809 int i;
802 810
803 for (i = 0; i < mp_irq_entries; i++) 811 for (i = 0; i < mp_irq_entries; i++)
804 if (mp_irqs[i].mpc_irqtype == type && 812 if (mp_irqs[i].mp_irqtype == type &&
805 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 813 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
806 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 814 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
807 mp_irqs[i].mpc_dstirq == pin) 815 mp_irqs[i].mp_dstirq == pin)
808 return i; 816 return i;
809 817
810 return -1; 818 return -1;
@@ -818,13 +826,13 @@ static int __init find_isa_irq_pin(int irq, int type)
818 int i; 826 int i;
819 827
820 for (i = 0; i < mp_irq_entries; i++) { 828 for (i = 0; i < mp_irq_entries; i++) {
821 int lbus = mp_irqs[i].mpc_srcbus; 829 int lbus = mp_irqs[i].mp_srcbus;
822 830
823 if (test_bit(lbus, mp_bus_not_pci) && 831 if (test_bit(lbus, mp_bus_not_pci) &&
824 (mp_irqs[i].mpc_irqtype == type) && 832 (mp_irqs[i].mp_irqtype == type) &&
825 (mp_irqs[i].mpc_srcbusirq == irq)) 833 (mp_irqs[i].mp_srcbusirq == irq))
826 834
827 return mp_irqs[i].mpc_dstirq; 835 return mp_irqs[i].mp_dstirq;
828 } 836 }
829 return -1; 837 return -1;
830} 838}
@@ -834,17 +842,17 @@ static int __init find_isa_irq_apic(int irq, int type)
834 int i; 842 int i;
835 843
836 for (i = 0; i < mp_irq_entries; i++) { 844 for (i = 0; i < mp_irq_entries; i++) {
837 int lbus = mp_irqs[i].mpc_srcbus; 845 int lbus = mp_irqs[i].mp_srcbus;
838 846
839 if (test_bit(lbus, mp_bus_not_pci) && 847 if (test_bit(lbus, mp_bus_not_pci) &&
840 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mp_irqtype == type) &&
841 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mp_srcbusirq == irq))
842 break; 850 break;
843 } 851 }
844 if (i < mp_irq_entries) { 852 if (i < mp_irq_entries) {
845 int apic; 853 int apic;
846 for(apic = 0; apic < nr_ioapics; apic++) { 854 for (apic = 0; apic < nr_ioapics; apic++) {
847 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 855 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
848 return apic; 856 return apic;
849 } 857 }
850 } 858 }
@@ -864,28 +872,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
864 872
865 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " 873 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
866 "slot:%d, pin:%d.\n", bus, slot, pin); 874 "slot:%d, pin:%d.\n", bus, slot, pin);
867 if (mp_bus_id_to_pci_bus[bus] == -1) { 875 if (test_bit(bus, mp_bus_not_pci)) {
868 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 876 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
869 return -1; 877 return -1;
870 } 878 }
871 for (i = 0; i < mp_irq_entries; i++) { 879 for (i = 0; i < mp_irq_entries; i++) {
872 int lbus = mp_irqs[i].mpc_srcbus; 880 int lbus = mp_irqs[i].mp_srcbus;
873 881
874 for (apic = 0; apic < nr_ioapics; apic++) 882 for (apic = 0; apic < nr_ioapics; apic++)
875 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 883 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
876 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 884 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
877 break; 885 break;
878 886
879 if (!test_bit(lbus, mp_bus_not_pci) && 887 if (!test_bit(lbus, mp_bus_not_pci) &&
880 !mp_irqs[i].mpc_irqtype && 888 !mp_irqs[i].mp_irqtype &&
881 (bus == lbus) && 889 (bus == lbus) &&
882 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 890 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
883 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 891 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
884 892
885 if (!(apic || IO_APIC_IRQ(irq))) 893 if (!(apic || IO_APIC_IRQ(irq)))
886 continue; 894 continue;
887 895
888 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 896 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
889 return irq; 897 return irq;
890 /* 898 /*
891 * Use the first all-but-pin matching entry as a 899 * Use the first all-but-pin matching entry as a
@@ -900,7 +908,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
900EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); 908EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
901 909
902/* 910/*
903 * This function currently is only a helper for the i386 smp boot process where 911 * This function currently is only a helper for the i386 smp boot process where
904 * we need to reprogram the ioredtbls to cater for the cpus which have come online 912 * we need to reprogram the ioredtbls to cater for the cpus which have come online
905 * so mask in all cases should simply be TARGET_CPUS 913 * so mask in all cases should simply be TARGET_CPUS
906 */ 914 */
@@ -952,7 +960,7 @@ static int EISA_ELCR(unsigned int irq)
952 * EISA conforming in the MP table, that means its trigger type must 960 * EISA conforming in the MP table, that means its trigger type must
953 * be read in from the ELCR */ 961 * be read in from the ELCR */
954 962
955#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) 963#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
956#define default_EISA_polarity(idx) default_ISA_polarity(idx) 964#define default_EISA_polarity(idx) default_ISA_polarity(idx)
957 965
958/* PCI interrupts are always polarity one level triggered, 966/* PCI interrupts are always polarity one level triggered,
@@ -969,118 +977,115 @@ static int EISA_ELCR(unsigned int irq)
969 977
970static int MPBIOS_polarity(int idx) 978static int MPBIOS_polarity(int idx)
971{ 979{
972 int bus = mp_irqs[idx].mpc_srcbus; 980 int bus = mp_irqs[idx].mp_srcbus;
973 int polarity; 981 int polarity;
974 982
975 /* 983 /*
976 * Determine IRQ line polarity (high active or low active): 984 * Determine IRQ line polarity (high active or low active):
977 */ 985 */
978 switch (mp_irqs[idx].mpc_irqflag & 3) 986 switch (mp_irqs[idx].mp_irqflag & 3) {
987 case 0: /* conforms, ie. bus-type dependent polarity */
979 { 988 {
980 case 0: /* conforms, ie. bus-type dependent polarity */ 989 polarity = test_bit(bus, mp_bus_not_pci)?
981 { 990 default_ISA_polarity(idx):
982 polarity = test_bit(bus, mp_bus_not_pci)? 991 default_PCI_polarity(idx);
983 default_ISA_polarity(idx): 992 break;
984 default_PCI_polarity(idx); 993 }
985 break; 994 case 1: /* high active */
986 } 995 {
987 case 1: /* high active */ 996 polarity = 0;
988 { 997 break;
989 polarity = 0; 998 }
990 break; 999 case 2: /* reserved */
991 } 1000 {
992 case 2: /* reserved */ 1001 printk(KERN_WARNING "broken BIOS!!\n");
993 { 1002 polarity = 1;
994 printk(KERN_WARNING "broken BIOS!!\n"); 1003 break;
995 polarity = 1; 1004 }
996 break; 1005 case 3: /* low active */
997 } 1006 {
998 case 3: /* low active */ 1007 polarity = 1;
999 { 1008 break;
1000 polarity = 1; 1009 }
1001 break; 1010 default: /* invalid */
1002 } 1011 {
1003 default: /* invalid */ 1012 printk(KERN_WARNING "broken BIOS!!\n");
1004 { 1013 polarity = 1;
1005 printk(KERN_WARNING "broken BIOS!!\n"); 1014 break;
1006 polarity = 1; 1015 }
1007 break;
1008 }
1009 } 1016 }
1010 return polarity; 1017 return polarity;
1011} 1018}
1012 1019
1013static int MPBIOS_trigger(int idx) 1020static int MPBIOS_trigger(int idx)
1014{ 1021{
1015 int bus = mp_irqs[idx].mpc_srcbus; 1022 int bus = mp_irqs[idx].mp_srcbus;
1016 int trigger; 1023 int trigger;
1017 1024
1018 /* 1025 /*
1019 * Determine IRQ trigger mode (edge or level sensitive): 1026 * Determine IRQ trigger mode (edge or level sensitive):
1020 */ 1027 */
1021 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 1028 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1029 case 0: /* conforms, ie. bus-type dependent */
1022 { 1030 {
1023 case 0: /* conforms, ie. bus-type dependent */ 1031 trigger = test_bit(bus, mp_bus_not_pci)?
1024 { 1032 default_ISA_trigger(idx):
1025 trigger = test_bit(bus, mp_bus_not_pci)? 1033 default_PCI_trigger(idx);
1026 default_ISA_trigger(idx):
1027 default_PCI_trigger(idx);
1028#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 1034#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1029 switch (mp_bus_id_to_type[bus]) 1035 switch (mp_bus_id_to_type[bus]) {
1030 { 1036 case MP_BUS_ISA: /* ISA pin */
1031 case MP_BUS_ISA: /* ISA pin */ 1037 {
1032 { 1038 /* set before the switch */
1033 /* set before the switch */
1034 break;
1035 }
1036 case MP_BUS_EISA: /* EISA pin */
1037 {
1038 trigger = default_EISA_trigger(idx);
1039 break;
1040 }
1041 case MP_BUS_PCI: /* PCI pin */
1042 {
1043 /* set before the switch */
1044 break;
1045 }
1046 case MP_BUS_MCA: /* MCA pin */
1047 {
1048 trigger = default_MCA_trigger(idx);
1049 break;
1050 }
1051 default:
1052 {
1053 printk(KERN_WARNING "broken BIOS!!\n");
1054 trigger = 1;
1055 break;
1056 }
1057 }
1058#endif
1059 break; 1039 break;
1060 } 1040 }
1061 case 1: /* edge */ 1041 case MP_BUS_EISA: /* EISA pin */
1062 { 1042 {
1063 trigger = 0; 1043 trigger = default_EISA_trigger(idx);
1064 break; 1044 break;
1065 } 1045 }
1066 case 2: /* reserved */ 1046 case MP_BUS_PCI: /* PCI pin */
1067 { 1047 {
1068 printk(KERN_WARNING "broken BIOS!!\n"); 1048 /* set before the switch */
1069 trigger = 1;
1070 break; 1049 break;
1071 } 1050 }
1072 case 3: /* level */ 1051 case MP_BUS_MCA: /* MCA pin */
1073 { 1052 {
1074 trigger = 1; 1053 trigger = default_MCA_trigger(idx);
1075 break; 1054 break;
1076 } 1055 }
1077 default: /* invalid */ 1056 default:
1078 { 1057 {
1079 printk(KERN_WARNING "broken BIOS!!\n"); 1058 printk(KERN_WARNING "broken BIOS!!\n");
1080 trigger = 0; 1059 trigger = 1;
1081 break; 1060 break;
1082 } 1061 }
1083 } 1062 }
1063#endif
1064 break;
1065 }
1066 case 1: /* edge */
1067 {
1068 trigger = 0;
1069 break;
1070 }
1071 case 2: /* reserved */
1072 {
1073 printk(KERN_WARNING "broken BIOS!!\n");
1074 trigger = 1;
1075 break;
1076 }
1077 case 3: /* level */
1078 {
1079 trigger = 1;
1080 break;
1081 }
1082 default: /* invalid */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 0;
1086 break;
1087 }
1088 }
1084 return trigger; 1089 return trigger;
1085} 1090}
1086 1091
@@ -1097,16 +1102,16 @@ static inline int irq_trigger(int idx)
1097static int pin_2_irq(int idx, int apic, int pin) 1102static int pin_2_irq(int idx, int apic, int pin)
1098{ 1103{
1099 int irq, i; 1104 int irq, i;
1100 int bus = mp_irqs[idx].mpc_srcbus; 1105 int bus = mp_irqs[idx].mp_srcbus;
1101 1106
1102 /* 1107 /*
1103 * Debugging check, we are in big trouble if this message pops up! 1108 * Debugging check, we are in big trouble if this message pops up!
1104 */ 1109 */
1105 if (mp_irqs[idx].mpc_dstirq != pin) 1110 if (mp_irqs[idx].mp_dstirq != pin)
1106 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1111 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1107 1112
1108 if (test_bit(bus, mp_bus_not_pci)) 1113 if (test_bit(bus, mp_bus_not_pci))
1109 irq = mp_irqs[idx].mpc_srcbusirq; 1114 irq = mp_irqs[idx].mp_srcbusirq;
1110 else { 1115 else {
1111 /* 1116 /*
1112 * PCI IRQs are mapped in order 1117 * PCI IRQs are mapped in order
@@ -1148,8 +1153,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1148 1153
1149 for (apic = 0; apic < nr_ioapics; apic++) { 1154 for (apic = 0; apic < nr_ioapics; apic++) {
1150 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1155 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1151 idx = find_irq_entry(apic,pin,mp_INT); 1156 idx = find_irq_entry(apic, pin, mp_INT);
1152 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) 1157 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1153 return irq_trigger(idx); 1158 return irq_trigger(idx);
1154 } 1159 }
1155 } 1160 }
@@ -1164,7 +1169,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
1164 1169
1165static int __assign_irq_vector(int irq) 1170static int __assign_irq_vector(int irq)
1166{ 1171{
1167 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1172 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1168 int vector, offset; 1173 int vector, offset;
1169 1174
1170 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); 1175 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1176,7 +1181,7 @@ static int __assign_irq_vector(int irq)
1176 offset = current_offset; 1181 offset = current_offset;
1177next: 1182next:
1178 vector += 8; 1183 vector += 8;
1179 if (vector >= FIRST_SYSTEM_VECTOR) { 1184 if (vector >= first_system_vector) {
1180 offset = (offset + 1) % 8; 1185 offset = (offset + 1) % 8;
1181 vector = FIRST_DEVICE_VECTOR + offset; 1186 vector = FIRST_DEVICE_VECTOR + offset;
1182 } 1187 }
@@ -1203,6 +1208,11 @@ static int assign_irq_vector(int irq)
1203 1208
1204 return vector; 1209 return vector;
1205} 1210}
1211
1212void setup_vector_irq(int cpu)
1213{
1214}
1215
1206static struct irq_chip ioapic_chip; 1216static struct irq_chip ioapic_chip;
1207 1217
1208#define IOAPIC_AUTO -1 1218#define IOAPIC_AUTO -1
@@ -1237,25 +1247,25 @@ static void __init setup_IO_APIC_irqs(void)
1237 /* 1247 /*
1238 * add it to the IO-APIC irq-routing table: 1248 * add it to the IO-APIC irq-routing table:
1239 */ 1249 */
1240 memset(&entry,0,sizeof(entry)); 1250 memset(&entry, 0, sizeof(entry));
1241 1251
1242 entry.delivery_mode = INT_DELIVERY_MODE; 1252 entry.delivery_mode = INT_DELIVERY_MODE;
1243 entry.dest_mode = INT_DEST_MODE; 1253 entry.dest_mode = INT_DEST_MODE;
1244 entry.mask = 0; /* enable IRQ */ 1254 entry.mask = 0; /* enable IRQ */
1245 entry.dest.logical.logical_dest = 1255 entry.dest.logical.logical_dest =
1246 cpu_mask_to_apicid(TARGET_CPUS); 1256 cpu_mask_to_apicid(TARGET_CPUS);
1247 1257
1248 idx = find_irq_entry(apic,pin,mp_INT); 1258 idx = find_irq_entry(apic, pin, mp_INT);
1249 if (idx == -1) { 1259 if (idx == -1) {
1250 if (first_notcon) { 1260 if (first_notcon) {
1251 apic_printk(APIC_VERBOSE, KERN_DEBUG 1261 apic_printk(APIC_VERBOSE, KERN_DEBUG
1252 " IO-APIC (apicid-pin) %d-%d", 1262 " IO-APIC (apicid-pin) %d-%d",
1253 mp_ioapics[apic].mpc_apicid, 1263 mp_ioapics[apic].mp_apicid,
1254 pin); 1264 pin);
1255 first_notcon = 0; 1265 first_notcon = 0;
1256 } else 1266 } else
1257 apic_printk(APIC_VERBOSE, ", %d-%d", 1267 apic_printk(APIC_VERBOSE, ", %d-%d",
1258 mp_ioapics[apic].mpc_apicid, pin); 1268 mp_ioapics[apic].mp_apicid, pin);
1259 continue; 1269 continue;
1260 } 1270 }
1261 1271
@@ -1289,7 +1299,7 @@ static void __init setup_IO_APIC_irqs(void)
1289 vector = assign_irq_vector(irq); 1299 vector = assign_irq_vector(irq);
1290 entry.vector = vector; 1300 entry.vector = vector;
1291 ioapic_register_intr(irq, vector, IOAPIC_AUTO); 1301 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1292 1302
1293 if (!apic && (irq < 16)) 1303 if (!apic && (irq < 16))
1294 disable_8259A_irq(irq); 1304 disable_8259A_irq(irq);
1295 } 1305 }
@@ -1302,25 +1312,21 @@ static void __init setup_IO_APIC_irqs(void)
1302} 1312}
1303 1313
1304/* 1314/*
1305 * Set up the 8259A-master output pin: 1315 * Set up the timer pin, possibly with the 8259A-master behind.
1306 */ 1316 */
1307static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) 1317static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1318 int vector)
1308{ 1319{
1309 struct IO_APIC_route_entry entry; 1320 struct IO_APIC_route_entry entry;
1310 1321
1311 memset(&entry,0,sizeof(entry)); 1322 memset(&entry, 0, sizeof(entry));
1312
1313 disable_8259A_irq(0);
1314
1315 /* mask LVT0 */
1316 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1317 1323
1318 /* 1324 /*
1319 * We use logical delivery to get the timer IRQ 1325 * We use logical delivery to get the timer IRQ
1320 * to the first CPU. 1326 * to the first CPU.
1321 */ 1327 */
1322 entry.dest_mode = INT_DEST_MODE; 1328 entry.dest_mode = INT_DEST_MODE;
1323 entry.mask = 0; /* unmask IRQ now */ 1329 entry.mask = 1; /* mask IRQ now */
1324 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 1330 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1325 entry.delivery_mode = INT_DELIVERY_MODE; 1331 entry.delivery_mode = INT_DELIVERY_MODE;
1326 entry.polarity = 0; 1332 entry.polarity = 0;
@@ -1329,17 +1335,14 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
1329 1335
1330 /* 1336 /*
1331 * The timer IRQ doesn't have to know that behind the 1337 * The timer IRQ doesn't have to know that behind the
1332 * scene we have a 8259A-master in AEOI mode ... 1338 * scene we may have a 8259A-master in AEOI mode ...
1333 */ 1339 */
1334 irq_desc[0].chip = &ioapic_chip; 1340 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1335 set_irq_handler(0, handle_edge_irq);
1336 1341
1337 /* 1342 /*
1338 * Add it to the IO-APIC irq-routing table: 1343 * Add it to the IO-APIC irq-routing table:
1339 */ 1344 */
1340 ioapic_write_entry(apic, pin, entry); 1345 ioapic_write_entry(apic, pin, entry);
1341
1342 enable_8259A_irq(0);
1343} 1346}
1344 1347
1345void __init print_IO_APIC(void) 1348void __init print_IO_APIC(void)
@@ -1354,10 +1357,10 @@ void __init print_IO_APIC(void)
1354 if (apic_verbosity == APIC_QUIET) 1357 if (apic_verbosity == APIC_QUIET)
1355 return; 1358 return;
1356 1359
1357 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1360 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1358 for (i = 0; i < nr_ioapics; i++) 1361 for (i = 0; i < nr_ioapics; i++)
1359 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1362 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1360 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 1363 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1361 1364
1362 /* 1365 /*
1363 * We are a bit conservative about what we expect. We have to 1366 * We are a bit conservative about what we expect. We have to
@@ -1376,7 +1379,7 @@ void __init print_IO_APIC(void)
1376 reg_03.raw = io_apic_read(apic, 3); 1379 reg_03.raw = io_apic_read(apic, 3);
1377 spin_unlock_irqrestore(&ioapic_lock, flags); 1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1378 1381
1379 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 1382 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1380 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1383 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1381 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1384 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1382 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1385 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1459,7 +1462,7 @@ void __init print_IO_APIC(void)
1459 1462
1460#if 0 1463#if 0
1461 1464
1462static void print_APIC_bitfield (int base) 1465static void print_APIC_bitfield(int base)
1463{ 1466{
1464 unsigned int v; 1467 unsigned int v;
1465 int i, j; 1468 int i, j;
@@ -1480,7 +1483,7 @@ static void print_APIC_bitfield (int base)
1480 } 1483 }
1481} 1484}
1482 1485
1483void /*__init*/ print_local_APIC(void * dummy) 1486void /*__init*/ print_local_APIC(void *dummy)
1484{ 1487{
1485 unsigned int v, ver, maxlvt; 1488 unsigned int v, ver, maxlvt;
1486 1489
@@ -1489,6 +1492,7 @@ void /*__init*/ print_local_APIC(void * dummy)
1489 1492
1490 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1493 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1491 smp_processor_id(), hard_smp_processor_id()); 1494 smp_processor_id(), hard_smp_processor_id());
1495 v = apic_read(APIC_ID);
1492 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, 1496 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1493 GET_APIC_ID(read_apic_id())); 1497 GET_APIC_ID(read_apic_id()));
1494 v = apic_read(APIC_LVR); 1498 v = apic_read(APIC_LVR);
@@ -1563,9 +1567,9 @@ void /*__init*/ print_local_APIC(void * dummy)
1563 printk("\n"); 1567 printk("\n");
1564} 1568}
1565 1569
1566void print_all_local_APICs (void) 1570void print_all_local_APICs(void)
1567{ 1571{
1568 on_each_cpu(print_local_APIC, NULL, 1, 1); 1572 on_each_cpu(print_local_APIC, NULL, 1);
1569} 1573}
1570 1574
1571void /*__init*/ print_PIC(void) 1575void /*__init*/ print_PIC(void)
@@ -1586,11 +1590,11 @@ void /*__init*/ print_PIC(void)
1586 v = inb(0xa0) << 8 | inb(0x20); 1590 v = inb(0xa0) << 8 | inb(0x20);
1587 printk(KERN_DEBUG "... PIC IRR: %04x\n", v); 1591 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1588 1592
1589 outb(0x0b,0xa0); 1593 outb(0x0b, 0xa0);
1590 outb(0x0b,0x20); 1594 outb(0x0b, 0x20);
1591 v = inb(0xa0) << 8 | inb(0x20); 1595 v = inb(0xa0) << 8 | inb(0x20);
1592 outb(0x0a,0xa0); 1596 outb(0x0a, 0xa0);
1593 outb(0x0a,0x20); 1597 outb(0x0a, 0x20);
1594 1598
1595 spin_unlock_irqrestore(&i8259A_lock, flags); 1599 spin_unlock_irqrestore(&i8259A_lock, flags);
1596 1600
@@ -1626,7 +1630,7 @@ static void __init enable_IO_APIC(void)
1626 spin_unlock_irqrestore(&ioapic_lock, flags); 1630 spin_unlock_irqrestore(&ioapic_lock, flags);
1627 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1631 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1628 } 1632 }
1629 for(apic = 0; apic < nr_ioapics; apic++) { 1633 for (apic = 0; apic < nr_ioapics; apic++) {
1630 int pin; 1634 int pin;
1631 /* See if any of the pins is in ExtINT mode */ 1635 /* See if any of the pins is in ExtINT mode */
1632 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1636 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1716,7 +1720,6 @@ void disable_IO_APIC(void)
1716 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1720 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1717 */ 1721 */
1718 1722
1719#ifndef CONFIG_X86_NUMAQ
1720static void __init setup_ioapic_ids_from_mpc(void) 1723static void __init setup_ioapic_ids_from_mpc(void)
1721{ 1724{
1722 union IO_APIC_reg_00 reg_00; 1725 union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1729,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
1726 unsigned char old_id; 1729 unsigned char old_id;
1727 unsigned long flags; 1730 unsigned long flags;
1728 1731
1732#ifdef CONFIG_X86_NUMAQ
1733 if (found_numaq)
1734 return;
1735#endif
1736
1729 /* 1737 /*
1730 * Don't check I/O APIC IDs for xAPIC systems. They have 1738 * Don't check I/O APIC IDs for xAPIC systems. They have
1731 * no meaning without the serial APIC bus. 1739 * no meaning without the serial APIC bus.
@@ -1748,15 +1756,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1748 spin_lock_irqsave(&ioapic_lock, flags); 1756 spin_lock_irqsave(&ioapic_lock, flags);
1749 reg_00.raw = io_apic_read(apic, 0); 1757 reg_00.raw = io_apic_read(apic, 0);
1750 spin_unlock_irqrestore(&ioapic_lock, flags); 1758 spin_unlock_irqrestore(&ioapic_lock, flags);
1751
1752 old_id = mp_ioapics[apic].mpc_apicid;
1753 1759
1754 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { 1760 old_id = mp_ioapics[apic].mp_apicid;
1761
1762 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1755 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1763 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1756 apic, mp_ioapics[apic].mpc_apicid); 1764 apic, mp_ioapics[apic].mp_apicid);
1757 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1765 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1758 reg_00.bits.ID); 1766 reg_00.bits.ID);
1759 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; 1767 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1760 } 1768 }
1761 1769
1762 /* 1770 /*
@@ -1765,9 +1773,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
1765 * 'stuck on smp_invalidate_needed IPI wait' messages. 1773 * 'stuck on smp_invalidate_needed IPI wait' messages.
1766 */ 1774 */
1767 if (check_apicid_used(phys_id_present_map, 1775 if (check_apicid_used(phys_id_present_map,
1768 mp_ioapics[apic].mpc_apicid)) { 1776 mp_ioapics[apic].mp_apicid)) {
1769 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1777 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1770 apic, mp_ioapics[apic].mpc_apicid); 1778 apic, mp_ioapics[apic].mp_apicid);
1771 for (i = 0; i < get_physical_broadcast(); i++) 1779 for (i = 0; i < get_physical_broadcast(); i++)
1772 if (!physid_isset(i, phys_id_present_map)) 1780 if (!physid_isset(i, phys_id_present_map))
1773 break; 1781 break;
@@ -1776,13 +1784,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
1776 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1784 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1777 i); 1785 i);
1778 physid_set(i, phys_id_present_map); 1786 physid_set(i, phys_id_present_map);
1779 mp_ioapics[apic].mpc_apicid = i; 1787 mp_ioapics[apic].mp_apicid = i;
1780 } else { 1788 } else {
1781 physid_mask_t tmp; 1789 physid_mask_t tmp;
1782 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); 1790 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1783 apic_printk(APIC_VERBOSE, "Setting %d in the " 1791 apic_printk(APIC_VERBOSE, "Setting %d in the "
1784 "phys_id_present_map\n", 1792 "phys_id_present_map\n",
1785 mp_ioapics[apic].mpc_apicid); 1793 mp_ioapics[apic].mp_apicid);
1786 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1794 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1787 } 1795 }
1788 1796
@@ -1791,21 +1799,21 @@ static void __init setup_ioapic_ids_from_mpc(void)
1791 * We need to adjust the IRQ routing table 1799 * We need to adjust the IRQ routing table
1792 * if the ID changed. 1800 * if the ID changed.
1793 */ 1801 */
1794 if (old_id != mp_ioapics[apic].mpc_apicid) 1802 if (old_id != mp_ioapics[apic].mp_apicid)
1795 for (i = 0; i < mp_irq_entries; i++) 1803 for (i = 0; i < mp_irq_entries; i++)
1796 if (mp_irqs[i].mpc_dstapic == old_id) 1804 if (mp_irqs[i].mp_dstapic == old_id)
1797 mp_irqs[i].mpc_dstapic 1805 mp_irqs[i].mp_dstapic
1798 = mp_ioapics[apic].mpc_apicid; 1806 = mp_ioapics[apic].mp_apicid;
1799 1807
1800 /* 1808 /*
1801 * Read the right value from the MPC table and 1809 * Read the right value from the MPC table and
1802 * write it into the ID register. 1810 * write it into the ID register.
1803 */ 1811 */
1804 apic_printk(APIC_VERBOSE, KERN_INFO 1812 apic_printk(APIC_VERBOSE, KERN_INFO
1805 "...changing IO-APIC physical APIC ID to %d ...", 1813 "...changing IO-APIC physical APIC ID to %d ...",
1806 mp_ioapics[apic].mpc_apicid); 1814 mp_ioapics[apic].mp_apicid);
1807 1815
1808 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; 1816 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1809 spin_lock_irqsave(&ioapic_lock, flags); 1817 spin_lock_irqsave(&ioapic_lock, flags);
1810 io_apic_write(apic, 0, reg_00.raw); 1818 io_apic_write(apic, 0, reg_00.raw);
1811 spin_unlock_irqrestore(&ioapic_lock, flags); 1819 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1824,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
1816 spin_lock_irqsave(&ioapic_lock, flags); 1824 spin_lock_irqsave(&ioapic_lock, flags);
1817 reg_00.raw = io_apic_read(apic, 0); 1825 reg_00.raw = io_apic_read(apic, 0);
1818 spin_unlock_irqrestore(&ioapic_lock, flags); 1826 spin_unlock_irqrestore(&ioapic_lock, flags);
1819 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) 1827 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1820 printk("could not set ID!\n"); 1828 printk("could not set ID!\n");
1821 else 1829 else
1822 apic_printk(APIC_VERBOSE, " ok.\n"); 1830 apic_printk(APIC_VERBOSE, " ok.\n");
1823 } 1831 }
1824} 1832}
1825#else
1826static void __init setup_ioapic_ids_from_mpc(void) { }
1827#endif
1828 1833
1829int no_timer_check __initdata; 1834int no_timer_check __initdata;
1830 1835
@@ -2015,45 +2020,53 @@ static inline void init_IO_APIC_traps(void)
2015 * The local APIC irq-chip implementation: 2020 * The local APIC irq-chip implementation:
2016 */ 2021 */
2017 2022
2018static void ack_apic(unsigned int irq) 2023static void ack_lapic_irq(unsigned int irq)
2019{ 2024{
2020 ack_APIC_irq(); 2025 ack_APIC_irq();
2021} 2026}
2022 2027
2023static void mask_lapic_irq (unsigned int irq) 2028static void mask_lapic_irq(unsigned int irq)
2024{ 2029{
2025 unsigned long v; 2030 unsigned long v;
2026 2031
2027 v = apic_read(APIC_LVT0); 2032 v = apic_read(APIC_LVT0);
2028 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2033 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2029} 2034}
2030 2035
2031static void unmask_lapic_irq (unsigned int irq) 2036static void unmask_lapic_irq(unsigned int irq)
2032{ 2037{
2033 unsigned long v; 2038 unsigned long v;
2034 2039
2035 v = apic_read(APIC_LVT0); 2040 v = apic_read(APIC_LVT0);
2036 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2041 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2037} 2042}
2038 2043
2039static struct irq_chip lapic_chip __read_mostly = { 2044static struct irq_chip lapic_chip __read_mostly = {
2040 .name = "local-APIC-edge", 2045 .name = "local-APIC",
2041 .mask = mask_lapic_irq, 2046 .mask = mask_lapic_irq,
2042 .unmask = unmask_lapic_irq, 2047 .unmask = unmask_lapic_irq,
2043 .eoi = ack_apic, 2048 .ack = ack_lapic_irq,
2044}; 2049};
2045 2050
2051static void lapic_register_intr(int irq, int vector)
2052{
2053 irq_desc[irq].status &= ~IRQ_LEVEL;
2054 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2055 "edge");
2056 set_intr_gate(vector, interrupt[irq]);
2057}
2058
2046static void __init setup_nmi(void) 2059static void __init setup_nmi(void)
2047{ 2060{
2048 /* 2061 /*
2049 * Dirty trick to enable the NMI watchdog ... 2062 * Dirty trick to enable the NMI watchdog ...
2050 * We put the 8259A master into AEOI mode and 2063 * We put the 8259A master into AEOI mode and
2051 * unmask on all local APICs LVT0 as NMI. 2064 * unmask on all local APICs LVT0 as NMI.
2052 * 2065 *
2053 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2066 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2054 * is from Maciej W. Rozycki - so we do not have to EOI from 2067 * is from Maciej W. Rozycki - so we do not have to EOI from
2055 * the NMI handler or the timer interrupt. 2068 * the NMI handler or the timer interrupt.
2056 */ 2069 */
2057 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); 2070 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2058 2071
2059 enable_NMI_through_LVT0(); 2072 enable_NMI_through_LVT0();
@@ -2129,11 +2142,16 @@ static inline void __init unlock_ExtINT_logic(void)
2129static inline void __init check_timer(void) 2142static inline void __init check_timer(void)
2130{ 2143{
2131 int apic1, pin1, apic2, pin2; 2144 int apic1, pin1, apic2, pin2;
2145 int no_pin1 = 0;
2132 int vector; 2146 int vector;
2147 unsigned int ver;
2133 unsigned long flags; 2148 unsigned long flags;
2134 2149
2135 local_irq_save(flags); 2150 local_irq_save(flags);
2136 2151
2152 ver = apic_read(APIC_LVR);
2153 ver = GET_APIC_VERSION(ver);
2154
2137 /* 2155 /*
2138 * get/set the timer IRQ vector: 2156 * get/set the timer IRQ vector:
2139 */ 2157 */
@@ -2142,34 +2160,54 @@ static inline void __init check_timer(void)
2142 set_intr_gate(vector, interrupt[0]); 2160 set_intr_gate(vector, interrupt[0]);
2143 2161
2144 /* 2162 /*
2145 * Subtle, code in do_timer_interrupt() expects an AEOI 2163 * As IRQ0 is to be enabled in the 8259A, the virtual
2146 * mode for the 8259A whenever interrupts are routed 2164 * wire has to be disabled in the local APIC. Also
2147 * through I/O APICs. Also IRQ0 has to be enabled in 2165 * timer interrupts need to be acknowledged manually in
2148 * the 8259A which implies the virtual wire has to be 2166 * the 8259A for the i82489DX when using the NMI
2149 * disabled in the local APIC. 2167 * watchdog as that APIC treats NMIs as level-triggered.
2168 * The AEOI mode will finish them in the 8259A
2169 * automatically.
2150 */ 2170 */
2151 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2171 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2152 init_8259A(1); 2172 init_8259A(1);
2153 timer_ack = 1; 2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2154 if (timer_over_8254 > 0)
2155 enable_8259A_irq(0);
2156 2174
2157 pin1 = find_isa_irq_pin(0, mp_INT); 2175 pin1 = find_isa_irq_pin(0, mp_INT);
2158 apic1 = find_isa_irq_apic(0, mp_INT); 2176 apic1 = find_isa_irq_apic(0, mp_INT);
2159 pin2 = ioapic_i8259.pin; 2177 pin2 = ioapic_i8259.pin;
2160 apic2 = ioapic_i8259.apic; 2178 apic2 = ioapic_i8259.apic;
2161 2179
2162 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2180 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2163 vector, apic1, pin1, apic2, pin2); 2181 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2182 vector, apic1, pin1, apic2, pin2);
2183
2184 /*
2185 * Some BIOS writers are clueless and report the ExtINTA
2186 * I/O APIC input from the cascaded 8259A as the timer
2187 * interrupt input. So just in case, if only one pin
2188 * was found above, try it both directly and through the
2189 * 8259A.
2190 */
2191 if (pin1 == -1) {
2192 pin1 = pin2;
2193 apic1 = apic2;
2194 no_pin1 = 1;
2195 } else if (pin2 == -1) {
2196 pin2 = pin1;
2197 apic2 = apic1;
2198 }
2164 2199
2165 if (pin1 != -1) { 2200 if (pin1 != -1) {
2166 /* 2201 /*
2167 * Ok, does IRQ0 through the IOAPIC work? 2202 * Ok, does IRQ0 through the IOAPIC work?
2168 */ 2203 */
2204 if (no_pin1) {
2205 add_pin_to_irq(0, apic1, pin1);
2206 setup_timer_IRQ0_pin(apic1, pin1, vector);
2207 }
2169 unmask_IO_APIC_irq(0); 2208 unmask_IO_APIC_irq(0);
2170 if (timer_irq_works()) { 2209 if (timer_irq_works()) {
2171 if (nmi_watchdog == NMI_IO_APIC) { 2210 if (nmi_watchdog == NMI_IO_APIC) {
2172 disable_8259A_irq(0);
2173 setup_nmi(); 2211 setup_nmi();
2174 enable_8259A_irq(0); 2212 enable_8259A_irq(0);
2175 } 2213 }
@@ -2178,81 +2216,97 @@ static inline void __init check_timer(void)
2178 goto out; 2216 goto out;
2179 } 2217 }
2180 clear_IO_APIC_pin(apic1, pin1); 2218 clear_IO_APIC_pin(apic1, pin1);
2181 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " 2219 if (!no_pin1)
2182 "IO-APIC\n"); 2220 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2183 } 2221 "8254 timer not connected to IO-APIC\n");
2184 2222
2185 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); 2223 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2186 if (pin2 != -1) { 2224 "(IRQ0) through the 8259A ...\n");
2187 printk("\n..... (found pin %d) ...", pin2); 2225 apic_printk(APIC_QUIET, KERN_INFO
2226 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2188 /* 2227 /*
2189 * legacy devices should be connected to IO APIC #0 2228 * legacy devices should be connected to IO APIC #0
2190 */ 2229 */
2191 setup_ExtINT_IRQ0_pin(apic2, pin2, vector); 2230 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2231 setup_timer_IRQ0_pin(apic2, pin2, vector);
2232 unmask_IO_APIC_irq(0);
2233 enable_8259A_irq(0);
2192 if (timer_irq_works()) { 2234 if (timer_irq_works()) {
2193 printk("works.\n"); 2235 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2194 if (pin1 != -1) 2236 timer_through_8259 = 1;
2195 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2196 else
2197 add_pin_to_irq(0, apic2, pin2);
2198 if (nmi_watchdog == NMI_IO_APIC) { 2237 if (nmi_watchdog == NMI_IO_APIC) {
2238 disable_8259A_irq(0);
2199 setup_nmi(); 2239 setup_nmi();
2240 enable_8259A_irq(0);
2200 } 2241 }
2201 goto out; 2242 goto out;
2202 } 2243 }
2203 /* 2244 /*
2204 * Cleanup, just in case ... 2245 * Cleanup, just in case ...
2205 */ 2246 */
2247 disable_8259A_irq(0);
2206 clear_IO_APIC_pin(apic2, pin2); 2248 clear_IO_APIC_pin(apic2, pin2);
2249 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2207 } 2250 }
2208 printk(" failed.\n");
2209 2251
2210 if (nmi_watchdog == NMI_IO_APIC) { 2252 if (nmi_watchdog == NMI_IO_APIC) {
2211 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2253 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2212 nmi_watchdog = 0; 2254 "through the IO-APIC - disabling NMI Watchdog!\n");
2255 nmi_watchdog = NMI_NONE;
2213 } 2256 }
2257 timer_ack = 0;
2214 2258
2215 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2259 apic_printk(APIC_QUIET, KERN_INFO
2260 "...trying to set up timer as Virtual Wire IRQ...\n");
2216 2261
2217 disable_8259A_irq(0); 2262 lapic_register_intr(0, vector);
2218 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2263 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2219 "fasteoi");
2220 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2221 enable_8259A_irq(0); 2264 enable_8259A_irq(0);
2222 2265
2223 if (timer_irq_works()) { 2266 if (timer_irq_works()) {
2224 printk(" works.\n"); 2267 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2225 goto out; 2268 goto out;
2226 } 2269 }
2227 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2270 disable_8259A_irq(0);
2228 printk(" failed.\n"); 2271 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2272 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2229 2273
2230 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2274 apic_printk(APIC_QUIET, KERN_INFO
2275 "...trying to set up timer as ExtINT IRQ...\n");
2231 2276
2232 timer_ack = 0;
2233 init_8259A(0); 2277 init_8259A(0);
2234 make_8259A_irq(0); 2278 make_8259A_irq(0);
2235 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2279 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2236 2280
2237 unlock_ExtINT_logic(); 2281 unlock_ExtINT_logic();
2238 2282
2239 if (timer_irq_works()) { 2283 if (timer_irq_works()) {
2240 printk(" works.\n"); 2284 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2241 goto out; 2285 goto out;
2242 } 2286 }
2243 printk(" failed :(.\n"); 2287 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2244 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2288 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2245 "report. Then try booting with the 'noapic' option"); 2289 "report. Then try booting with the 'noapic' option.\n");
2246out: 2290out:
2247 local_irq_restore(flags); 2291 local_irq_restore(flags);
2248} 2292}
2249 2293
2250/* 2294/*
2251 * 2295 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2252 * IRQ's that are handled by the PIC in the MPS IOAPIC case. 2296 * to devices. However there may be an I/O APIC pin available for
2253 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 2297 * this interrupt regardless. The pin may be left unconnected, but
2254 * Linux doesn't really care, as it's not actually used 2298 * typically it will be reused as an ExtINT cascade interrupt for
2255 * for any interrupt handling anyway. 2299 * the master 8259A. In the MPS case such a pin will normally be
2300 * reported as an ExtINT interrupt in the MP table. With ACPI
2301 * there is no provision for ExtINT interrupts, and in the absence
2302 * of an override it would be treated as an ordinary ISA I/O APIC
2303 * interrupt, that is edge-triggered and unmasked by default. We
2304 * used to do this, but it caused problems on some systems because
2305 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2306 * the same ExtINT cascade interrupt to drive the local APIC of the
2307 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2308 * the I/O APIC in all cases now. No actual device should request
2309 * it anyway. --macro
2256 */ 2310 */
2257#define PIC_IRQS (1 << PIC_CASCADE_IR) 2311#define PIC_IRQS (1 << PIC_CASCADE_IR)
2258 2312
@@ -2261,15 +2315,12 @@ void __init setup_IO_APIC(void)
2261 int i; 2315 int i;
2262 2316
2263 /* Reserve all the system vectors. */ 2317 /* Reserve all the system vectors. */
2264 for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) 2318 for (i = first_system_vector; i < NR_VECTORS; i++)
2265 set_bit(i, used_vectors); 2319 set_bit(i, used_vectors);
2266 2320
2267 enable_IO_APIC(); 2321 enable_IO_APIC();
2268 2322
2269 if (acpi_ioapic) 2323 io_apic_irqs = ~PIC_IRQS;
2270 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2271 else
2272 io_apic_irqs = ~PIC_IRQS;
2273 2324
2274 printk("ENABLING IO-APIC IRQs\n"); 2325 printk("ENABLING IO-APIC IRQs\n");
2275 2326
@@ -2286,28 +2337,14 @@ void __init setup_IO_APIC(void)
2286 print_IO_APIC(); 2337 print_IO_APIC();
2287} 2338}
2288 2339
2289static int __init setup_disable_8254_timer(char *s)
2290{
2291 timer_over_8254 = -1;
2292 return 1;
2293}
2294static int __init setup_enable_8254_timer(char *s)
2295{
2296 timer_over_8254 = 2;
2297 return 1;
2298}
2299
2300__setup("disable_8254_timer", setup_disable_8254_timer);
2301__setup("enable_8254_timer", setup_enable_8254_timer);
2302
2303/* 2340/*
2304 * Called after all the initialization is done. If we didnt find any 2341 * Called after all the initialization is done. If we didnt find any
2305 * APIC bugs then we can allow the modify fast path 2342 * APIC bugs then we can allow the modify fast path
2306 */ 2343 */
2307 2344
2308static int __init io_apic_bug_finalize(void) 2345static int __init io_apic_bug_finalize(void)
2309{ 2346{
2310 if(sis_apic_bug == -1) 2347 if (sis_apic_bug == -1)
2311 sis_apic_bug = 0; 2348 sis_apic_bug = 0;
2312 return 0; 2349 return 0;
2313} 2350}
@@ -2318,17 +2355,17 @@ struct sysfs_ioapic_data {
2318 struct sys_device dev; 2355 struct sys_device dev;
2319 struct IO_APIC_route_entry entry[0]; 2356 struct IO_APIC_route_entry entry[0];
2320}; 2357};
2321static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; 2358static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2322 2359
2323static int ioapic_suspend(struct sys_device *dev, pm_message_t state) 2360static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2324{ 2361{
2325 struct IO_APIC_route_entry *entry; 2362 struct IO_APIC_route_entry *entry;
2326 struct sysfs_ioapic_data *data; 2363 struct sysfs_ioapic_data *data;
2327 int i; 2364 int i;
2328 2365
2329 data = container_of(dev, struct sysfs_ioapic_data, dev); 2366 data = container_of(dev, struct sysfs_ioapic_data, dev);
2330 entry = data->entry; 2367 entry = data->entry;
2331 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) 2368 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2332 entry[i] = ioapic_read_entry(dev->id, i); 2369 entry[i] = ioapic_read_entry(dev->id, i);
2333 2370
2334 return 0; 2371 return 0;
@@ -2341,18 +2378,18 @@ static int ioapic_resume(struct sys_device *dev)
2341 unsigned long flags; 2378 unsigned long flags;
2342 union IO_APIC_reg_00 reg_00; 2379 union IO_APIC_reg_00 reg_00;
2343 int i; 2380 int i;
2344 2381
2345 data = container_of(dev, struct sysfs_ioapic_data, dev); 2382 data = container_of(dev, struct sysfs_ioapic_data, dev);
2346 entry = data->entry; 2383 entry = data->entry;
2347 2384
2348 spin_lock_irqsave(&ioapic_lock, flags); 2385 spin_lock_irqsave(&ioapic_lock, flags);
2349 reg_00.raw = io_apic_read(dev->id, 0); 2386 reg_00.raw = io_apic_read(dev->id, 0);
2350 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 2387 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2351 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 2388 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2352 io_apic_write(dev->id, 0, reg_00.raw); 2389 io_apic_write(dev->id, 0, reg_00.raw);
2353 } 2390 }
2354 spin_unlock_irqrestore(&ioapic_lock, flags); 2391 spin_unlock_irqrestore(&ioapic_lock, flags);
2355 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) 2392 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2356 ioapic_write_entry(dev->id, i, entry[i]); 2393 ioapic_write_entry(dev->id, i, entry[i]);
2357 2394
2358 return 0; 2395 return 0;
@@ -2366,24 +2403,23 @@ static struct sysdev_class ioapic_sysdev_class = {
2366 2403
2367static int __init ioapic_init_sysfs(void) 2404static int __init ioapic_init_sysfs(void)
2368{ 2405{
2369 struct sys_device * dev; 2406 struct sys_device *dev;
2370 int i, size, error = 0; 2407 int i, size, error = 0;
2371 2408
2372 error = sysdev_class_register(&ioapic_sysdev_class); 2409 error = sysdev_class_register(&ioapic_sysdev_class);
2373 if (error) 2410 if (error)
2374 return error; 2411 return error;
2375 2412
2376 for (i = 0; i < nr_ioapics; i++ ) { 2413 for (i = 0; i < nr_ioapics; i++) {
2377 size = sizeof(struct sys_device) + nr_ioapic_registers[i] 2414 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2378 * sizeof(struct IO_APIC_route_entry); 2415 * sizeof(struct IO_APIC_route_entry);
2379 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); 2416 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2380 if (!mp_ioapic_data[i]) { 2417 if (!mp_ioapic_data[i]) {
2381 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); 2418 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2382 continue; 2419 continue;
2383 } 2420 }
2384 memset(mp_ioapic_data[i], 0, size);
2385 dev = &mp_ioapic_data[i]->dev; 2421 dev = &mp_ioapic_data[i]->dev;
2386 dev->id = i; 2422 dev->id = i;
2387 dev->cls = &ioapic_sysdev_class; 2423 dev->cls = &ioapic_sysdev_class;
2388 error = sysdev_register(dev); 2424 error = sysdev_register(dev);
2389 if (error) { 2425 if (error) {
@@ -2458,7 +2494,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2458 msg->address_lo = 2494 msg->address_lo =
2459 MSI_ADDR_BASE_LO | 2495 MSI_ADDR_BASE_LO |
2460 ((INT_DEST_MODE == 0) ? 2496 ((INT_DEST_MODE == 0) ?
2461 MSI_ADDR_DEST_MODE_PHYSICAL: 2497MSI_ADDR_DEST_MODE_PHYSICAL:
2462 MSI_ADDR_DEST_MODE_LOGICAL) | 2498 MSI_ADDR_DEST_MODE_LOGICAL) |
2463 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 2499 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2464 MSI_ADDR_REDIRECTION_CPU: 2500 MSI_ADDR_REDIRECTION_CPU:
@@ -2469,7 +2505,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2469 MSI_DATA_TRIGGER_EDGE | 2505 MSI_DATA_TRIGGER_EDGE |
2470 MSI_DATA_LEVEL_ASSERT | 2506 MSI_DATA_LEVEL_ASSERT |
2471 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 2507 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2472 MSI_DATA_DELIVERY_FIXED: 2508MSI_DATA_DELIVERY_FIXED:
2473 MSI_DATA_DELIVERY_LOWPRI) | 2509 MSI_DATA_DELIVERY_LOWPRI) |
2474 MSI_DATA_VECTOR(vector); 2510 MSI_DATA_VECTOR(vector);
2475 } 2511 }
@@ -2640,12 +2676,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2640#endif /* CONFIG_HT_IRQ */ 2676#endif /* CONFIG_HT_IRQ */
2641 2677
2642/* -------------------------------------------------------------------------- 2678/* --------------------------------------------------------------------------
2643 ACPI-based IOAPIC Configuration 2679 ACPI-based IOAPIC Configuration
2644 -------------------------------------------------------------------------- */ 2680 -------------------------------------------------------------------------- */
2645 2681
2646#ifdef CONFIG_ACPI 2682#ifdef CONFIG_ACPI
2647 2683
2648int __init io_apic_get_unique_id (int ioapic, int apic_id) 2684int __init io_apic_get_unique_id(int ioapic, int apic_id)
2649{ 2685{
2650 union IO_APIC_reg_00 reg_00; 2686 union IO_APIC_reg_00 reg_00;
2651 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 2687 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2654,10 +2690,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2654 int i = 0; 2690 int i = 0;
2655 2691
2656 /* 2692 /*
2657 * The P4 platform supports up to 256 APIC IDs on two separate APIC 2693 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2658 * buses (one for LAPICs, one for IOAPICs), where predecessors only 2694 * buses (one for LAPICs, one for IOAPICs), where predecessors only
2659 * supports up to 16 on one shared APIC bus. 2695 * supports up to 16 on one shared APIC bus.
2660 * 2696 *
2661 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full 2697 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2662 * advantage of new APIC bus architecture. 2698 * advantage of new APIC bus architecture.
2663 */ 2699 */
@@ -2676,7 +2712,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2676 } 2712 }
2677 2713
2678 /* 2714 /*
2679 * Every APIC in a system must have a unique ID or we get lots of nice 2715 * Every APIC in a system must have a unique ID or we get lots of nice
2680 * 'stuck on smp_invalidate_needed IPI wait' messages. 2716 * 'stuck on smp_invalidate_needed IPI wait' messages.
2681 */ 2717 */
2682 if (check_apicid_used(apic_id_map, apic_id)) { 2718 if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2693,7 +2729,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2693 "trying %d\n", ioapic, apic_id, i); 2729 "trying %d\n", ioapic, apic_id, i);
2694 2730
2695 apic_id = i; 2731 apic_id = i;
2696 } 2732 }
2697 2733
2698 tmp = apicid_to_cpu_present(apic_id); 2734 tmp = apicid_to_cpu_present(apic_id);
2699 physids_or(apic_id_map, apic_id_map, tmp); 2735 physids_or(apic_id_map, apic_id_map, tmp);
@@ -2720,7 +2756,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
2720} 2756}
2721 2757
2722 2758
2723int __init io_apic_get_version (int ioapic) 2759int __init io_apic_get_version(int ioapic)
2724{ 2760{
2725 union IO_APIC_reg_01 reg_01; 2761 union IO_APIC_reg_01 reg_01;
2726 unsigned long flags; 2762 unsigned long flags;
@@ -2733,7 +2769,7 @@ int __init io_apic_get_version (int ioapic)
2733} 2769}
2734 2770
2735 2771
2736int __init io_apic_get_redir_entries (int ioapic) 2772int __init io_apic_get_redir_entries(int ioapic)
2737{ 2773{
2738 union IO_APIC_reg_01 reg_01; 2774 union IO_APIC_reg_01 reg_01;
2739 unsigned long flags; 2775 unsigned long flags;
@@ -2746,7 +2782,7 @@ int __init io_apic_get_redir_entries (int ioapic)
2746} 2782}
2747 2783
2748 2784
2749int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) 2785int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2750{ 2786{
2751 struct IO_APIC_route_entry entry; 2787 struct IO_APIC_route_entry entry;
2752 2788
@@ -2762,7 +2798,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2762 * corresponding device driver registers for this IRQ. 2798 * corresponding device driver registers for this IRQ.
2763 */ 2799 */
2764 2800
2765 memset(&entry,0,sizeof(entry)); 2801 memset(&entry, 0, sizeof(entry));
2766 2802
2767 entry.delivery_mode = INT_DELIVERY_MODE; 2803 entry.delivery_mode = INT_DELIVERY_MODE;
2768 entry.dest_mode = INT_DEST_MODE; 2804 entry.dest_mode = INT_DEST_MODE;
@@ -2781,7 +2817,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2781 2817
2782 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " 2818 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2783 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, 2819 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2784 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, 2820 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2785 edge_level, active_high_low); 2821 edge_level, active_high_low);
2786 2822
2787 ioapic_register_intr(irq, entry.vector, edge_level); 2823 ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2838,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2802 return -1; 2838 return -1;
2803 2839
2804 for (i = 0; i < mp_irq_entries; i++) 2840 for (i = 0; i < mp_irq_entries; i++)
2805 if (mp_irqs[i].mpc_irqtype == mp_INT && 2841 if (mp_irqs[i].mp_irqtype == mp_INT &&
2806 mp_irqs[i].mpc_srcbusirq == bus_irq) 2842 mp_irqs[i].mp_srcbusirq == bus_irq)
2807 break; 2843 break;
2808 if (i >= mp_irq_entries) 2844 if (i >= mp_irq_entries)
2809 return -1; 2845 return -1;
@@ -2836,3 +2872,34 @@ static int __init parse_noapic(char *arg)
2836 return 0; 2872 return 0;
2837} 2873}
2838early_param("noapic", parse_noapic); 2874early_param("noapic", parse_noapic);
2875
2876void __init ioapic_init_mappings(void)
2877{
2878 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2879 int i;
2880
2881 for (i = 0; i < nr_ioapics; i++) {
2882 if (smp_found_config) {
2883 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2884 if (!ioapic_phys) {
2885 printk(KERN_ERR
2886 "WARNING: bogus zero IO-APIC "
2887 "address found in MPTABLE, "
2888 "disabling IO/APIC support!\n");
2889 smp_found_config = 0;
2890 skip_ioapic_setup = 1;
2891 goto fake_ioapic_page;
2892 }
2893 } else {
2894fake_ioapic_page:
2895 ioapic_phys = (unsigned long)
2896 alloc_bootmem_pages(PAGE_SIZE);
2897 ioapic_phys = __pa(ioapic_phys);
2898 }
2899 set_fixmap_nocache(idx, ioapic_phys);
2900 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2901 __fix_to_virt(idx), ioapic_phys);
2902 idx++;
2903 }
2904}
2905
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..8269434d1707 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
45#include <asm/proto.h> 45#include <asm/proto.h>
46#include <asm/acpi.h> 46#include <asm/acpi.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/i8259.h>
48#include <asm/nmi.h> 49#include <asm/nmi.h>
49#include <asm/msidef.h> 50#include <asm/msidef.h>
50#include <asm/hypertransport.h> 51#include <asm/hypertransport.h>
@@ -61,7 +62,7 @@ struct irq_cfg {
61}; 62};
62 63
63/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 64/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
64struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { 65static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
65 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 66 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
66 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 67 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
67 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 68 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -82,6 +83,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
82 83
83static int assign_irq_vector(int irq, cpumask_t mask); 84static int assign_irq_vector(int irq, cpumask_t mask);
84 85
86int first_system_vector = 0xfe;
87
88char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
89
85#define __apicdebuginit __init 90#define __apicdebuginit __init
86 91
87int sis_apic_bug; /* not actually supported, dummy for compile */ 92int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -90,7 +95,7 @@ static int no_timer_check;
90 95
91static int disable_timer_pin_1 __initdata; 96static int disable_timer_pin_1 __initdata;
92 97
93int timer_over_8254 __initdata = 1; 98int timer_through_8259 __initdata;
94 99
95/* Where if anywhere is the i8259 connect in external int mode */ 100/* Where if anywhere is the i8259 connect in external int mode */
96static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 101static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -104,15 +109,17 @@ DEFINE_SPINLOCK(vector_lock);
104int nr_ioapic_registers[MAX_IO_APICS]; 109int nr_ioapic_registers[MAX_IO_APICS];
105 110
106/* I/O APIC entries */ 111/* I/O APIC entries */
107struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; 112struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
108int nr_ioapics; 113int nr_ioapics;
109 114
110/* MP IRQ source entries */ 115/* MP IRQ source entries */
111struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 116struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
112 117
113/* # of MP IRQ source entries */ 118/* # of MP IRQ source entries */
114int mp_irq_entries; 119int mp_irq_entries;
115 120
121DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
122
116/* 123/*
117 * Rough estimation of how many shared IRQs there are, can 124 * Rough estimation of how many shared IRQs there are, can
118 * be changed anytime. 125 * be changed anytime.
@@ -140,7 +147,7 @@ struct io_apic {
140static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 147static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
141{ 148{
142 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 149 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
143 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); 150 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
144} 151}
145 152
146static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 153static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -183,7 +190,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
183 break; 190 break;
184 reg = io_apic_read(entry->apic, 0x10 + pin*2); 191 reg = io_apic_read(entry->apic, 0x10 + pin*2);
185 /* Is the remote IRR bit set? */ 192 /* Is the remote IRR bit set? */
186 if ((reg >> 14) & 1) { 193 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
187 spin_unlock_irqrestore(&ioapic_lock, flags); 194 spin_unlock_irqrestore(&ioapic_lock, flags);
188 return true; 195 return true;
189 } 196 }
@@ -298,7 +305,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
298 break; 305 break;
299 io_apic_write(apic, 0x11 + pin*2, dest); 306 io_apic_write(apic, 0x11 + pin*2, dest);
300 reg = io_apic_read(apic, 0x10 + pin*2); 307 reg = io_apic_read(apic, 0x10 + pin*2);
301 reg &= ~0x000000ff; 308 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
302 reg |= vector; 309 reg |= vector;
303 io_apic_modify(apic, reg); 310 io_apic_modify(apic, reg);
304 if (!entry->next) 311 if (!entry->next)
@@ -360,16 +367,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
360 entry->pin = pin; 367 entry->pin = pin;
361} 368}
362 369
370/*
371 * Reroute an IRQ to a different pin.
372 */
373static void __init replace_pin_at_irq(unsigned int irq,
374 int oldapic, int oldpin,
375 int newapic, int newpin)
376{
377 struct irq_pin_list *entry = irq_2_pin + irq;
378
379 while (1) {
380 if (entry->apic == oldapic && entry->pin == oldpin) {
381 entry->apic = newapic;
382 entry->pin = newpin;
383 }
384 if (!entry->next)
385 break;
386 entry = irq_2_pin + entry->next;
387 }
388}
389
363 390
364#define DO_ACTION(name,R,ACTION, FINAL) \ 391#define DO_ACTION(name,R,ACTION, FINAL) \
365 \ 392 \
366 static void name##_IO_APIC_irq (unsigned int irq) \ 393 static void name##_IO_APIC_irq (unsigned int irq) \
367 __DO_ACTION(R, ACTION, FINAL) 394 __DO_ACTION(R, ACTION, FINAL)
368 395
369DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) 396/* mask = 1 */
370 /* mask = 1 */ 397DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
371DO_ACTION( __unmask, 0, &= 0xfffeffff, ) 398
372 /* mask = 0 */ 399/* mask = 0 */
400DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
373 401
374static void mask_IO_APIC_irq (unsigned int irq) 402static void mask_IO_APIC_irq (unsigned int irq)
375{ 403{
@@ -430,20 +458,6 @@ static int __init disable_timer_pin_setup(char *arg)
430} 458}
431__setup("disable_timer_pin_1", disable_timer_pin_setup); 459__setup("disable_timer_pin_1", disable_timer_pin_setup);
432 460
433static int __init setup_disable_8254_timer(char *s)
434{
435 timer_over_8254 = -1;
436 return 1;
437}
438static int __init setup_enable_8254_timer(char *s)
439{
440 timer_over_8254 = 2;
441 return 1;
442}
443
444__setup("disable_8254_timer", setup_disable_8254_timer);
445__setup("enable_8254_timer", setup_enable_8254_timer);
446
447 461
448/* 462/*
449 * Find the IRQ entry number of a certain pin. 463 * Find the IRQ entry number of a certain pin.
@@ -453,10 +467,10 @@ static int find_irq_entry(int apic, int pin, int type)
453 int i; 467 int i;
454 468
455 for (i = 0; i < mp_irq_entries; i++) 469 for (i = 0; i < mp_irq_entries; i++)
456 if (mp_irqs[i].mpc_irqtype == type && 470 if (mp_irqs[i].mp_irqtype == type &&
457 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || 471 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
458 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && 472 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
459 mp_irqs[i].mpc_dstirq == pin) 473 mp_irqs[i].mp_dstirq == pin)
460 return i; 474 return i;
461 475
462 return -1; 476 return -1;
@@ -470,13 +484,13 @@ static int __init find_isa_irq_pin(int irq, int type)
470 int i; 484 int i;
471 485
472 for (i = 0; i < mp_irq_entries; i++) { 486 for (i = 0; i < mp_irq_entries; i++) {
473 int lbus = mp_irqs[i].mpc_srcbus; 487 int lbus = mp_irqs[i].mp_srcbus;
474 488
475 if (test_bit(lbus, mp_bus_not_pci) && 489 if (test_bit(lbus, mp_bus_not_pci) &&
476 (mp_irqs[i].mpc_irqtype == type) && 490 (mp_irqs[i].mp_irqtype == type) &&
477 (mp_irqs[i].mpc_srcbusirq == irq)) 491 (mp_irqs[i].mp_srcbusirq == irq))
478 492
479 return mp_irqs[i].mpc_dstirq; 493 return mp_irqs[i].mp_dstirq;
480 } 494 }
481 return -1; 495 return -1;
482} 496}
@@ -486,17 +500,17 @@ static int __init find_isa_irq_apic(int irq, int type)
486 int i; 500 int i;
487 501
488 for (i = 0; i < mp_irq_entries; i++) { 502 for (i = 0; i < mp_irq_entries; i++) {
489 int lbus = mp_irqs[i].mpc_srcbus; 503 int lbus = mp_irqs[i].mp_srcbus;
490 504
491 if (test_bit(lbus, mp_bus_not_pci) && 505 if (test_bit(lbus, mp_bus_not_pci) &&
492 (mp_irqs[i].mpc_irqtype == type) && 506 (mp_irqs[i].mp_irqtype == type) &&
493 (mp_irqs[i].mpc_srcbusirq == irq)) 507 (mp_irqs[i].mp_srcbusirq == irq))
494 break; 508 break;
495 } 509 }
496 if (i < mp_irq_entries) { 510 if (i < mp_irq_entries) {
497 int apic; 511 int apic;
498 for(apic = 0; apic < nr_ioapics; apic++) { 512 for(apic = 0; apic < nr_ioapics; apic++) {
499 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) 513 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
500 return apic; 514 return apic;
501 } 515 }
502 } 516 }
@@ -516,28 +530,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
516 530
517 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", 531 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
518 bus, slot, pin); 532 bus, slot, pin);
519 if (mp_bus_id_to_pci_bus[bus] == -1) { 533 if (test_bit(bus, mp_bus_not_pci)) {
520 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); 534 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
521 return -1; 535 return -1;
522 } 536 }
523 for (i = 0; i < mp_irq_entries; i++) { 537 for (i = 0; i < mp_irq_entries; i++) {
524 int lbus = mp_irqs[i].mpc_srcbus; 538 int lbus = mp_irqs[i].mp_srcbus;
525 539
526 for (apic = 0; apic < nr_ioapics; apic++) 540 for (apic = 0; apic < nr_ioapics; apic++)
527 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || 541 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
528 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 542 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
529 break; 543 break;
530 544
531 if (!test_bit(lbus, mp_bus_not_pci) && 545 if (!test_bit(lbus, mp_bus_not_pci) &&
532 !mp_irqs[i].mpc_irqtype && 546 !mp_irqs[i].mp_irqtype &&
533 (bus == lbus) && 547 (bus == lbus) &&
534 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 548 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
535 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); 549 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
536 550
537 if (!(apic || IO_APIC_IRQ(irq))) 551 if (!(apic || IO_APIC_IRQ(irq)))
538 continue; 552 continue;
539 553
540 if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) 554 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
541 return irq; 555 return irq;
542 /* 556 /*
543 * Use the first all-but-pin matching entry as a 557 * Use the first all-but-pin matching entry as a
@@ -565,13 +579,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
565 579
566static int MPBIOS_polarity(int idx) 580static int MPBIOS_polarity(int idx)
567{ 581{
568 int bus = mp_irqs[idx].mpc_srcbus; 582 int bus = mp_irqs[idx].mp_srcbus;
569 int polarity; 583 int polarity;
570 584
571 /* 585 /*
572 * Determine IRQ line polarity (high active or low active): 586 * Determine IRQ line polarity (high active or low active):
573 */ 587 */
574 switch (mp_irqs[idx].mpc_irqflag & 3) 588 switch (mp_irqs[idx].mp_irqflag & 3)
575 { 589 {
576 case 0: /* conforms, ie. bus-type dependent polarity */ 590 case 0: /* conforms, ie. bus-type dependent polarity */
577 if (test_bit(bus, mp_bus_not_pci)) 591 if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +621,13 @@ static int MPBIOS_polarity(int idx)
607 621
608static int MPBIOS_trigger(int idx) 622static int MPBIOS_trigger(int idx)
609{ 623{
610 int bus = mp_irqs[idx].mpc_srcbus; 624 int bus = mp_irqs[idx].mp_srcbus;
611 int trigger; 625 int trigger;
612 626
613 /* 627 /*
614 * Determine IRQ trigger mode (edge or level sensitive): 628 * Determine IRQ trigger mode (edge or level sensitive):
615 */ 629 */
616 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 630 switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
617 { 631 {
618 case 0: /* conforms, ie. bus-type dependent */ 632 case 0: /* conforms, ie. bus-type dependent */
619 if (test_bit(bus, mp_bus_not_pci)) 633 if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +674,16 @@ static inline int irq_trigger(int idx)
660static int pin_2_irq(int idx, int apic, int pin) 674static int pin_2_irq(int idx, int apic, int pin)
661{ 675{
662 int irq, i; 676 int irq, i;
663 int bus = mp_irqs[idx].mpc_srcbus; 677 int bus = mp_irqs[idx].mp_srcbus;
664 678
665 /* 679 /*
666 * Debugging check, we are in big trouble if this message pops up! 680 * Debugging check, we are in big trouble if this message pops up!
667 */ 681 */
668 if (mp_irqs[idx].mpc_dstirq != pin) 682 if (mp_irqs[idx].mp_dstirq != pin)
669 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 683 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
670 684
671 if (test_bit(bus, mp_bus_not_pci)) { 685 if (test_bit(bus, mp_bus_not_pci)) {
672 irq = mp_irqs[idx].mpc_srcbusirq; 686 irq = mp_irqs[idx].mp_srcbusirq;
673 } else { 687 } else {
674 /* 688 /*
675 * PCI IRQs are mapped in order 689 * PCI IRQs are mapped in order
@@ -718,7 +732,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
718 return 0; 732 return 0;
719 } 733 }
720 734
721 for_each_cpu_mask(cpu, mask) { 735 for_each_cpu_mask_nr(cpu, mask) {
722 cpumask_t domain, new_mask; 736 cpumask_t domain, new_mask;
723 int new_cpu; 737 int new_cpu;
724 int vector, offset; 738 int vector, offset;
@@ -730,7 +744,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
730 offset = current_offset; 744 offset = current_offset;
731next: 745next:
732 vector += 8; 746 vector += 8;
733 if (vector >= FIRST_SYSTEM_VECTOR) { 747 if (vector >= first_system_vector) {
734 /* If we run out of vectors on large boxen, must share them. */ 748 /* If we run out of vectors on large boxen, must share them. */
735 offset = (offset + 1) % 8; 749 offset = (offset + 1) % 8;
736 vector = FIRST_DEVICE_VECTOR + offset; 750 vector = FIRST_DEVICE_VECTOR + offset;
@@ -739,7 +753,7 @@ next:
739 continue; 753 continue;
740 if (vector == IA32_SYSCALL_VECTOR) 754 if (vector == IA32_SYSCALL_VECTOR)
741 goto next; 755 goto next;
742 for_each_cpu_mask(new_cpu, new_mask) 756 for_each_cpu_mask_nr(new_cpu, new_mask)
743 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 757 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
744 goto next; 758 goto next;
745 /* Found one! */ 759 /* Found one! */
@@ -749,7 +763,7 @@ next:
749 cfg->move_in_progress = 1; 763 cfg->move_in_progress = 1;
750 cfg->old_domain = cfg->domain; 764 cfg->old_domain = cfg->domain;
751 } 765 }
752 for_each_cpu_mask(new_cpu, new_mask) 766 for_each_cpu_mask_nr(new_cpu, new_mask)
753 per_cpu(vector_irq, new_cpu)[vector] = irq; 767 per_cpu(vector_irq, new_cpu)[vector] = irq;
754 cfg->vector = vector; 768 cfg->vector = vector;
755 cfg->domain = domain; 769 cfg->domain = domain;
@@ -781,14 +795,14 @@ static void __clear_irq_vector(int irq)
781 795
782 vector = cfg->vector; 796 vector = cfg->vector;
783 cpus_and(mask, cfg->domain, cpu_online_map); 797 cpus_and(mask, cfg->domain, cpu_online_map);
784 for_each_cpu_mask(cpu, mask) 798 for_each_cpu_mask_nr(cpu, mask)
785 per_cpu(vector_irq, cpu)[vector] = -1; 799 per_cpu(vector_irq, cpu)[vector] = -1;
786 800
787 cfg->vector = 0; 801 cfg->vector = 0;
788 cpus_clear(cfg->domain); 802 cpus_clear(cfg->domain);
789} 803}
790 804
791void __setup_vector_irq(int cpu) 805static void __setup_vector_irq(int cpu)
792{ 806{
793 /* Initialize vector_irq on a new cpu */ 807 /* Initialize vector_irq on a new cpu */
794 /* This function must be called with vector_lock held */ 808 /* This function must be called with vector_lock held */
@@ -811,6 +825,13 @@ void __setup_vector_irq(int cpu)
811 } 825 }
812} 826}
813 827
828void setup_vector_irq(int cpu)
829{
830 spin_lock(&vector_lock);
831 __setup_vector_irq(smp_processor_id());
832 spin_unlock(&vector_lock);
833}
834
814 835
815static struct irq_chip ioapic_chip; 836static struct irq_chip ioapic_chip;
816 837
@@ -846,7 +867,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
846 apic_printk(APIC_VERBOSE,KERN_DEBUG 867 apic_printk(APIC_VERBOSE,KERN_DEBUG
847 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 868 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
848 "IRQ %d Mode:%i Active:%i)\n", 869 "IRQ %d Mode:%i Active:%i)\n",
849 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, 870 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
850 irq, trigger, polarity); 871 irq, trigger, polarity);
851 872
852 /* 873 /*
@@ -887,10 +908,10 @@ static void __init setup_IO_APIC_irqs(void)
887 idx = find_irq_entry(apic,pin,mp_INT); 908 idx = find_irq_entry(apic,pin,mp_INT);
888 if (idx == -1) { 909 if (idx == -1) {
889 if (first_notcon) { 910 if (first_notcon) {
890 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); 911 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
891 first_notcon = 0; 912 first_notcon = 0;
892 } else 913 } else
893 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); 914 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
894 continue; 915 continue;
895 } 916 }
896 if (!first_notcon) { 917 if (!first_notcon) {
@@ -911,26 +932,21 @@ static void __init setup_IO_APIC_irqs(void)
911} 932}
912 933
913/* 934/*
914 * Set up the 8259A-master output pin as broadcast to all 935 * Set up the timer pin, possibly with the 8259A-master behind.
915 * CPUs.
916 */ 936 */
917static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) 937static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
938 int vector)
918{ 939{
919 struct IO_APIC_route_entry entry; 940 struct IO_APIC_route_entry entry;
920 941
921 memset(&entry, 0, sizeof(entry)); 942 memset(&entry, 0, sizeof(entry));
922 943
923 disable_8259A_irq(0);
924
925 /* mask LVT0 */
926 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
927
928 /* 944 /*
929 * We use logical delivery to get the timer IRQ 945 * We use logical delivery to get the timer IRQ
930 * to the first CPU. 946 * to the first CPU.
931 */ 947 */
932 entry.dest_mode = INT_DEST_MODE; 948 entry.dest_mode = INT_DEST_MODE;
933 entry.mask = 0; /* unmask IRQ now */ 949 entry.mask = 1; /* mask IRQ now */
934 entry.dest = cpu_mask_to_apicid(TARGET_CPUS); 950 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
935 entry.delivery_mode = INT_DELIVERY_MODE; 951 entry.delivery_mode = INT_DELIVERY_MODE;
936 entry.polarity = 0; 952 entry.polarity = 0;
@@ -939,7 +955,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
939 955
940 /* 956 /*
941 * The timer IRQ doesn't have to know that behind the 957 * The timer IRQ doesn't have to know that behind the
942 * scene we have a 8259A-master in AEOI mode ... 958 * scene we may have a 8259A-master in AEOI mode ...
943 */ 959 */
944 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 960 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
945 961
@@ -947,8 +963,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
947 * Add it to the IO-APIC irq-routing table: 963 * Add it to the IO-APIC irq-routing table:
948 */ 964 */
949 ioapic_write_entry(apic, pin, entry); 965 ioapic_write_entry(apic, pin, entry);
950
951 enable_8259A_irq(0);
952} 966}
953 967
954void __apicdebuginit print_IO_APIC(void) 968void __apicdebuginit print_IO_APIC(void)
@@ -965,7 +979,7 @@ void __apicdebuginit print_IO_APIC(void)
965 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 979 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
966 for (i = 0; i < nr_ioapics; i++) 980 for (i = 0; i < nr_ioapics; i++)
967 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 981 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
968 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); 982 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
969 983
970 /* 984 /*
971 * We are a bit conservative about what we expect. We have to 985 * We are a bit conservative about what we expect. We have to
@@ -983,7 +997,7 @@ void __apicdebuginit print_IO_APIC(void)
983 spin_unlock_irqrestore(&ioapic_lock, flags); 997 spin_unlock_irqrestore(&ioapic_lock, flags);
984 998
985 printk("\n"); 999 printk("\n");
986 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); 1000 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
987 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1001 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
988 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1002 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
989 1003
@@ -1077,6 +1091,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1077 1091
1078 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1092 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1079 smp_processor_id(), hard_smp_processor_id()); 1093 smp_processor_id(), hard_smp_processor_id());
1094 v = apic_read(APIC_ID);
1080 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1095 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
1081 v = apic_read(APIC_LVR); 1096 v = apic_read(APIC_LVR);
1082 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1097 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1146,7 +1161,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1146 1161
1147void print_all_local_APICs (void) 1162void print_all_local_APICs (void)
1148{ 1163{
1149 on_each_cpu(print_local_APIC, NULL, 1, 1); 1164 on_each_cpu(print_local_APIC, NULL, 1);
1150} 1165}
1151 1166
1152void __apicdebuginit print_PIC(void) 1167void __apicdebuginit print_PIC(void)
@@ -1358,12 +1373,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1358static int ioapic_retrigger_irq(unsigned int irq) 1373static int ioapic_retrigger_irq(unsigned int irq)
1359{ 1374{
1360 struct irq_cfg *cfg = &irq_cfg[irq]; 1375 struct irq_cfg *cfg = &irq_cfg[irq];
1361 cpumask_t mask;
1362 unsigned long flags; 1376 unsigned long flags;
1363 1377
1364 spin_lock_irqsave(&vector_lock, flags); 1378 spin_lock_irqsave(&vector_lock, flags);
1365 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 1379 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1366 send_IPI_mask(mask, cfg->vector);
1367 spin_unlock_irqrestore(&vector_lock, flags); 1380 spin_unlock_irqrestore(&vector_lock, flags);
1368 1381
1369 return 1; 1382 return 1;
@@ -1540,7 +1553,7 @@ static inline void init_IO_APIC_traps(void)
1540 } 1553 }
1541} 1554}
1542 1555
1543static void enable_lapic_irq (unsigned int irq) 1556static void unmask_lapic_irq(unsigned int irq)
1544{ 1557{
1545 unsigned long v; 1558 unsigned long v;
1546 1559
@@ -1548,7 +1561,7 @@ static void enable_lapic_irq (unsigned int irq)
1548 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 1561 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1549} 1562}
1550 1563
1551static void disable_lapic_irq (unsigned int irq) 1564static void mask_lapic_irq(unsigned int irq)
1552{ 1565{
1553 unsigned long v; 1566 unsigned long v;
1554 1567
@@ -1561,19 +1574,20 @@ static void ack_lapic_irq (unsigned int irq)
1561 ack_APIC_irq(); 1574 ack_APIC_irq();
1562} 1575}
1563 1576
1564static void end_lapic_irq (unsigned int i) { /* nothing */ } 1577static struct irq_chip lapic_chip __read_mostly = {
1565 1578 .name = "local-APIC",
1566static struct hw_interrupt_type lapic_irq_type __read_mostly = { 1579 .mask = mask_lapic_irq,
1567 .name = "local-APIC", 1580 .unmask = unmask_lapic_irq,
1568 .typename = "local-APIC-edge", 1581 .ack = ack_lapic_irq,
1569 .startup = NULL, /* startup_irq() not used for IRQ0 */
1570 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1571 .enable = enable_lapic_irq,
1572 .disable = disable_lapic_irq,
1573 .ack = ack_lapic_irq,
1574 .end = end_lapic_irq,
1575}; 1582};
1576 1583
1584static void lapic_register_intr(int irq)
1585{
1586 irq_desc[irq].status &= ~IRQ_LEVEL;
1587 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1588 "edge");
1589}
1590
1577static void __init setup_nmi(void) 1591static void __init setup_nmi(void)
1578{ 1592{
1579 /* 1593 /*
@@ -1659,6 +1673,7 @@ static inline void __init check_timer(void)
1659 struct irq_cfg *cfg = irq_cfg + 0; 1673 struct irq_cfg *cfg = irq_cfg + 0;
1660 int apic1, pin1, apic2, pin2; 1674 int apic1, pin1, apic2, pin2;
1661 unsigned long flags; 1675 unsigned long flags;
1676 int no_pin1 = 0;
1662 1677
1663 local_irq_save(flags); 1678 local_irq_save(flags);
1664 1679
@@ -1669,34 +1684,48 @@ static inline void __init check_timer(void)
1669 assign_irq_vector(0, TARGET_CPUS); 1684 assign_irq_vector(0, TARGET_CPUS);
1670 1685
1671 /* 1686 /*
1672 * Subtle, code in do_timer_interrupt() expects an AEOI 1687 * As IRQ0 is to be enabled in the 8259A, the virtual
1673 * mode for the 8259A whenever interrupts are routed 1688 * wire has to be disabled in the local APIC.
1674 * through I/O APICs. Also IRQ0 has to be enabled in
1675 * the 8259A which implies the virtual wire has to be
1676 * disabled in the local APIC.
1677 */ 1689 */
1678 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 1690 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1679 init_8259A(1); 1691 init_8259A(1);
1680 if (timer_over_8254 > 0)
1681 enable_8259A_irq(0);
1682 1692
1683 pin1 = find_isa_irq_pin(0, mp_INT); 1693 pin1 = find_isa_irq_pin(0, mp_INT);
1684 apic1 = find_isa_irq_apic(0, mp_INT); 1694 apic1 = find_isa_irq_apic(0, mp_INT);
1685 pin2 = ioapic_i8259.pin; 1695 pin2 = ioapic_i8259.pin;
1686 apic2 = ioapic_i8259.apic; 1696 apic2 = ioapic_i8259.apic;
1687 1697
1688 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1698 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1689 cfg->vector, apic1, pin1, apic2, pin2); 1699 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
1700 cfg->vector, apic1, pin1, apic2, pin2);
1701
1702 /*
1703 * Some BIOS writers are clueless and report the ExtINTA
1704 * I/O APIC input from the cascaded 8259A as the timer
1705 * interrupt input. So just in case, if only one pin
1706 * was found above, try it both directly and through the
1707 * 8259A.
1708 */
1709 if (pin1 == -1) {
1710 pin1 = pin2;
1711 apic1 = apic2;
1712 no_pin1 = 1;
1713 } else if (pin2 == -1) {
1714 pin2 = pin1;
1715 apic2 = apic1;
1716 }
1690 1717
1691 if (pin1 != -1) { 1718 if (pin1 != -1) {
1692 /* 1719 /*
1693 * Ok, does IRQ0 through the IOAPIC work? 1720 * Ok, does IRQ0 through the IOAPIC work?
1694 */ 1721 */
1722 if (no_pin1) {
1723 add_pin_to_irq(0, apic1, pin1);
1724 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
1725 }
1695 unmask_IO_APIC_irq(0); 1726 unmask_IO_APIC_irq(0);
1696 if (!no_timer_check && timer_irq_works()) { 1727 if (!no_timer_check && timer_irq_works()) {
1697 nmi_watchdog_default();
1698 if (nmi_watchdog == NMI_IO_APIC) { 1728 if (nmi_watchdog == NMI_IO_APIC) {
1699 disable_8259A_irq(0);
1700 setup_nmi(); 1729 setup_nmi();
1701 enable_8259A_irq(0); 1730 enable_8259A_irq(0);
1702 } 1731 }
@@ -1705,54 +1734,62 @@ static inline void __init check_timer(void)
1705 goto out; 1734 goto out;
1706 } 1735 }
1707 clear_IO_APIC_pin(apic1, pin1); 1736 clear_IO_APIC_pin(apic1, pin1);
1708 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " 1737 if (!no_pin1)
1709 "connected to IO-APIC\n"); 1738 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1710 } 1739 "8254 timer not connected to IO-APIC\n");
1711 1740
1712 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " 1741 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1713 "through the 8259A ... "); 1742 "(IRQ0) through the 8259A ...\n");
1714 if (pin2 != -1) { 1743 apic_printk(APIC_QUIET, KERN_INFO
1715 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 1744 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1716 apic2, pin2);
1717 /* 1745 /*
1718 * legacy devices should be connected to IO APIC #0 1746 * legacy devices should be connected to IO APIC #0
1719 */ 1747 */
1720 setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); 1748 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
1749 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
1750 unmask_IO_APIC_irq(0);
1751 enable_8259A_irq(0);
1721 if (timer_irq_works()) { 1752 if (timer_irq_works()) {
1722 apic_printk(APIC_VERBOSE," works.\n"); 1753 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1723 nmi_watchdog_default(); 1754 timer_through_8259 = 1;
1724 if (nmi_watchdog == NMI_IO_APIC) { 1755 if (nmi_watchdog == NMI_IO_APIC) {
1756 disable_8259A_irq(0);
1725 setup_nmi(); 1757 setup_nmi();
1758 enable_8259A_irq(0);
1726 } 1759 }
1727 goto out; 1760 goto out;
1728 } 1761 }
1729 /* 1762 /*
1730 * Cleanup, just in case ... 1763 * Cleanup, just in case ...
1731 */ 1764 */
1765 disable_8259A_irq(0);
1732 clear_IO_APIC_pin(apic2, pin2); 1766 clear_IO_APIC_pin(apic2, pin2);
1767 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1733 } 1768 }
1734 apic_printk(APIC_VERBOSE," failed.\n");
1735 1769
1736 if (nmi_watchdog == NMI_IO_APIC) { 1770 if (nmi_watchdog == NMI_IO_APIC) {
1737 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1771 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1738 nmi_watchdog = 0; 1772 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE;
1739 } 1774 }
1740 1775
1741 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1776 apic_printk(APIC_QUIET, KERN_INFO
1777 "...trying to set up timer as Virtual Wire IRQ...\n");
1742 1778
1743 disable_8259A_irq(0); 1779 lapic_register_intr(0);
1744 irq_desc[0].chip = &lapic_irq_type;
1745 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1780 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1746 enable_8259A_irq(0); 1781 enable_8259A_irq(0);
1747 1782
1748 if (timer_irq_works()) { 1783 if (timer_irq_works()) {
1749 apic_printk(APIC_VERBOSE," works.\n"); 1784 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1750 goto out; 1785 goto out;
1751 } 1786 }
1787 disable_8259A_irq(0);
1752 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 1788 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1753 apic_printk(APIC_VERBOSE," failed.\n"); 1789 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1754 1790
1755 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 1791 apic_printk(APIC_QUIET, KERN_INFO
1792 "...trying to set up timer as ExtINT IRQ...\n");
1756 1793
1757 init_8259A(0); 1794 init_8259A(0);
1758 make_8259A_irq(0); 1795 make_8259A_irq(0);
@@ -1761,11 +1798,12 @@ static inline void __init check_timer(void)
1761 unlock_ExtINT_logic(); 1798 unlock_ExtINT_logic();
1762 1799
1763 if (timer_irq_works()) { 1800 if (timer_irq_works()) {
1764 apic_printk(APIC_VERBOSE," works.\n"); 1801 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1765 goto out; 1802 goto out;
1766 } 1803 }
1767 apic_printk(APIC_VERBOSE," failed :(.\n"); 1804 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1768 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1805 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
1806 "report. Then try booting with the 'noapic' option.\n");
1769out: 1807out:
1770 local_irq_restore(flags); 1808 local_irq_restore(flags);
1771} 1809}
@@ -1778,11 +1816,21 @@ static int __init notimercheck(char *s)
1778__setup("no_timer_check", notimercheck); 1816__setup("no_timer_check", notimercheck);
1779 1817
1780/* 1818/*
1781 * 1819 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1782 * IRQs that are handled by the PIC in the MPS IOAPIC case. 1820 * to devices. However there may be an I/O APIC pin available for
1783 * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 1821 * this interrupt regardless. The pin may be left unconnected, but
1784 * Linux doesn't really care, as it's not actually used 1822 * typically it will be reused as an ExtINT cascade interrupt for
1785 * for any interrupt handling anyway. 1823 * the master 8259A. In the MPS case such a pin will normally be
1824 * reported as an ExtINT interrupt in the MP table. With ACPI
1825 * there is no provision for ExtINT interrupts, and in the absence
1826 * of an override it would be treated as an ordinary ISA I/O APIC
1827 * interrupt, that is edge-triggered and unmasked by default. We
1828 * used to do this, but it caused problems on some systems because
1829 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
1830 * the same ExtINT cascade interrupt to drive the local APIC of the
1831 * bootstrap processor. Therefore we refrain from routing IRQ2 to
1832 * the I/O APIC in all cases now. No actual device should request
1833 * it anyway. --macro
1786 */ 1834 */
1787#define PIC_IRQS (1<<2) 1835#define PIC_IRQS (1<<2)
1788 1836
@@ -1793,10 +1841,7 @@ void __init setup_IO_APIC(void)
1793 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 1841 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1794 */ 1842 */
1795 1843
1796 if (acpi_ioapic) 1844 io_apic_irqs = ~PIC_IRQS;
1797 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1798 else
1799 io_apic_irqs = ~PIC_IRQS;
1800 1845
1801 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 1846 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1802 1847
@@ -1841,8 +1886,8 @@ static int ioapic_resume(struct sys_device *dev)
1841 1886
1842 spin_lock_irqsave(&ioapic_lock, flags); 1887 spin_lock_irqsave(&ioapic_lock, flags);
1843 reg_00.raw = io_apic_read(dev->id, 0); 1888 reg_00.raw = io_apic_read(dev->id, 0);
1844 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { 1889 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
1845 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 1890 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
1846 io_apic_write(dev->id, 0, reg_00.raw); 1891 io_apic_write(dev->id, 0, reg_00.raw);
1847 } 1892 }
1848 spin_unlock_irqrestore(&ioapic_lock, flags); 1893 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2242,8 +2287,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2242 return -1; 2287 return -1;
2243 2288
2244 for (i = 0; i < mp_irq_entries; i++) 2289 for (i = 0; i < mp_irq_entries; i++)
2245 if (mp_irqs[i].mpc_irqtype == mp_INT && 2290 if (mp_irqs[i].mp_irqtype == mp_INT &&
2246 mp_irqs[i].mpc_srcbusirq == bus_irq) 2291 mp_irqs[i].mp_srcbusirq == bus_irq)
2247 break; 2292 break;
2248 if (i >= mp_irq_entries) 2293 if (i >= mp_irq_entries)
2249 return -1; 2294 return -1;
@@ -2336,7 +2381,7 @@ void __init ioapic_init_mappings(void)
2336 ioapic_res = ioapic_setup_resources(); 2381 ioapic_res = ioapic_setup_resources();
2337 for (i = 0; i < nr_ioapics; i++) { 2382 for (i = 0; i < nr_ioapics; i++) {
2338 if (smp_found_config) { 2383 if (smp_found_config) {
2339 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 2384 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2340 } else { 2385 } else {
2341 ioapic_phys = (unsigned long) 2386 ioapic_phys = (unsigned long)
2342 alloc_bootmem_pages(PAGE_SIZE); 2387 alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..1c3a66a67f83 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
103 103
104static int __init io_delay_param(char *s) 104static int __init io_delay_param(char *s)
105{ 105{
106 if (!s)
107 return -EINVAL;
108
106 if (!strcmp(s, "0x80")) 109 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 110 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 111 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca23..3f7537b669d3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
8#include <linux/kernel_stat.h> 8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h> 9#include <linux/mc146818rtc.h>
10#include <linux/cache.h> 10#include <linux/cache.h>
11#include <linux/interrupt.h>
12#include <linux/cpu.h> 11#include <linux/cpu.h>
13#include <linux/module.h> 12#include <linux/module.h>
14 13
@@ -71,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
71 /* 70 /*
72 * Send the IPI. The write to APIC_ICR fires this off. 71 * Send the IPI. The write to APIC_ICR fires this off.
73 */ 72 */
74 apic_write_around(APIC_ICR, cfg); 73 apic_write(APIC_ICR, cfg);
75} 74}
76 75
77void send_IPI_self(int vector) 76void send_IPI_self(int vector)
@@ -99,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
99 * prepare target chip field 98 * prepare target chip field
100 */ 99 */
101 cfg = __prepare_ICR2(mask); 100 cfg = __prepare_ICR2(mask);
102 apic_write_around(APIC_ICR2, cfg); 101 apic_write(APIC_ICR2, cfg);
103 102
104 /* 103 /*
105 * program the ICR 104 * program the ICR
@@ -109,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
109 /* 108 /*
110 * Send the IPI. The write to APIC_ICR fires this off. 109 * Send the IPI. The write to APIC_ICR fires this off.
111 */ 110 */
112 apic_write_around(APIC_ICR, cfg); 111 apic_write(APIC_ICR, cfg);
113} 112}
114 113
115/* 114/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
48#endif 48#endif
49} 49}
50 50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void)
54{
55 long sp;
56
57 __asm__ __volatile__("andl %%esp,%0" :
58 "=r" (sp) : "0" (THREAD_SIZE - 1));
59
60 return sp < (sizeof(struct thread_info) + STACK_WARN);
61}
62
63static void print_stack_overflow(void)
64{
65 printk(KERN_WARNING "low stack detected by irq handler\n");
66 dump_stack();
67}
68
69#else
70static inline int check_stack_overflow(void) { return 0; }
71static inline void print_stack_overflow(void) { }
72#endif
73
51#ifdef CONFIG_4KSTACKS 74#ifdef CONFIG_4KSTACKS
52/* 75/*
53 * per-CPU IRQ handling contexts (thread information and stack) 76 * per-CPU IRQ handling contexts (thread information and stack)
@@ -59,48 +82,26 @@ union irq_ctx {
59 82
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
62#endif
63
64/*
65 * do_IRQ handles all normal device IRQ's (the special
66 * SMP cross-CPU interrupts have their own specific
67 * handlers).
68 */
69unsigned int do_IRQ(struct pt_regs *regs)
70{
71 struct pt_regs *old_regs;
72 /* high bit used in ret_from_ code */
73 int irq = ~regs->orig_ax;
74 struct irq_desc *desc = irq_desc + irq;
75#ifdef CONFIG_4KSTACKS
76 union irq_ctx *curctx, *irqctx;
77 u32 *isp;
78#endif
79 85
80 if (unlikely((unsigned)irq >= NR_IRQS)) { 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
81 printk(KERN_EMERG "%s: cannot handle IRQ %d\n", 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
82 __func__, irq);
83 BUG();
84 }
85 88
86 old_regs = set_irq_regs(regs); 89static void call_on_stack(void *func, void *stack)
87 irq_enter(); 90{
88#ifdef CONFIG_DEBUG_STACKOVERFLOW 91 asm volatile("xchgl %%ebx,%%esp \n"
89 /* Debugging check for stack overflow: is there less than 1KB free? */ 92 "call *%%edi \n"
90 { 93 "movl %%ebx,%%esp \n"
91 long sp; 94 : "=b" (stack)
92 95 : "0" (stack),
93 __asm__ __volatile__("andl %%esp,%0" : 96 "D"(func)
94 "=r" (sp) : "0" (THREAD_SIZE - 1)); 97 : "memory", "cc", "edx", "ecx", "eax");
95 if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { 98}
96 printk("do_IRQ: stack overflow: %ld\n",
97 sp - sizeof(struct thread_info));
98 dump_stack();
99 }
100 }
101#endif
102 99
103#ifdef CONFIG_4KSTACKS 100static inline int
101execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
102{
103 union irq_ctx *curctx, *irqctx;
104 u32 *isp, arg1, arg2;
104 105
105 curctx = (union irq_ctx *) current_thread_info(); 106 curctx = (union irq_ctx *) current_thread_info();
106 irqctx = hardirq_ctx[smp_processor_id()]; 107 irqctx = hardirq_ctx[smp_processor_id()];
@@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs)
111 * handler) we can't do that and just have to keep using the 112 * handler) we can't do that and just have to keep using the
112 * current stack (which is the irq stack already after all) 113 * current stack (which is the irq stack already after all)
113 */ 114 */
114 if (curctx != irqctx) { 115 if (unlikely(curctx == irqctx))
115 int arg1, arg2, bx; 116 return 0;
116
117 /* build the stack frame on the IRQ stack */
118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
119 irqctx->tinfo.task = curctx->tinfo.task;
120 irqctx->tinfo.previous_esp = current_stack_pointer;
121 117
122 /* 118 /* build the stack frame on the IRQ stack */
123 * Copy the softirq bits in preempt_count so that the 119 isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
124 * softirq checks work in the hardirq context. 120 irqctx->tinfo.task = curctx->tinfo.task;
125 */ 121 irqctx->tinfo.previous_esp = current_stack_pointer;
126 irqctx->tinfo.preempt_count =
127 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
129
130 asm volatile(
131 " xchgl %%ebx,%%esp \n"
132 " call *%%edi \n"
133 " movl %%ebx,%%esp \n"
134 : "=a" (arg1), "=d" (arg2), "=b" (bx)
135 : "0" (irq), "1" (desc), "2" (isp),
136 "D" (desc->handle_irq)
137 : "memory", "cc", "ecx"
138 );
139 } else
140#endif
141 desc->handle_irq(irq, desc);
142 122
143 irq_exit(); 123 /*
144 set_irq_regs(old_regs); 124 * Copy the softirq bits in preempt_count so that the
125 * softirq checks work in the hardirq context.
126 */
127 irqctx->tinfo.preempt_count =
128 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
129 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
130
131 if (unlikely(overflow))
132 call_on_stack(print_stack_overflow, isp);
133
134 asm volatile("xchgl %%ebx,%%esp \n"
135 "call *%%edi \n"
136 "movl %%ebx,%%esp \n"
137 : "=a" (arg1), "=d" (arg2), "=b" (isp)
138 : "0" (irq), "1" (desc), "2" (isp),
139 "D" (desc->handle_irq)
140 : "memory", "cc", "ecx");
145 return 1; 141 return 1;
146} 142}
147 143
148#ifdef CONFIG_4KSTACKS
149
150static char softirq_stack[NR_CPUS * THREAD_SIZE]
151 __attribute__((__section__(".bss.page_aligned")));
152
153static char hardirq_stack[NR_CPUS * THREAD_SIZE]
154 __attribute__((__section__(".bss.page_aligned")));
155
156/* 144/*
157 * allocate per-cpu stacks for hardirq and for softirq processing 145 * allocate per-cpu stacks for hardirq and for softirq processing
158 */ 146 */
159void irq_ctx_init(int cpu) 147void __cpuinit irq_ctx_init(int cpu)
160{ 148{
161 union irq_ctx *irqctx; 149 union irq_ctx *irqctx;
162 150
@@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
164 return; 152 return;
165 153
166 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 154 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
167 irqctx->tinfo.task = NULL; 155 irqctx->tinfo.task = NULL;
168 irqctx->tinfo.exec_domain = NULL; 156 irqctx->tinfo.exec_domain = NULL;
169 irqctx->tinfo.cpu = cpu; 157 irqctx->tinfo.cpu = cpu;
170 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 158 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
171 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 159 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
172 160
173 hardirq_ctx[cpu] = irqctx; 161 hardirq_ctx[cpu] = irqctx;
174 162
175 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; 163 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
176 irqctx->tinfo.task = NULL; 164 irqctx->tinfo.task = NULL;
177 irqctx->tinfo.exec_domain = NULL; 165 irqctx->tinfo.exec_domain = NULL;
178 irqctx->tinfo.cpu = cpu; 166 irqctx->tinfo.cpu = cpu;
179 irqctx->tinfo.preempt_count = 0; 167 irqctx->tinfo.preempt_count = 0;
180 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 168 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
181 169
182 softirq_ctx[cpu] = irqctx; 170 softirq_ctx[cpu] = irqctx;
183 171
184 printk("CPU %u irqstacks, hard=%p soft=%p\n", 172 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
185 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); 173 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
186} 174}
187 175
188void irq_ctx_exit(int cpu) 176void irq_ctx_exit(int cpu)
@@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
211 /* build the stack frame on the softirq stack */ 199 /* build the stack frame on the softirq stack */
212 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 200 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
213 201
214 asm volatile( 202 call_on_stack(__do_softirq, isp);
215 " xchgl %%ebx,%%esp \n"
216 " call __do_softirq \n"
217 " movl %%ebx,%%esp \n"
218 : "=b"(isp)
219 : "0"(isp)
220 : "memory", "cc", "edx", "ecx", "eax"
221 );
222 /* 203 /*
223 * Shouldnt happen, we returned above if in_interrupt(): 204 * Shouldnt happen, we returned above if in_interrupt():
224 */ 205 */
225 WARN_ON_ONCE(softirq_count()); 206 WARN_ON_ONCE(softirq_count());
226 } 207 }
227 208
228 local_irq_restore(flags); 209 local_irq_restore(flags);
229} 210}
211
212#else
213static inline int
214execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
230#endif 215#endif
231 216
232/* 217/*
218 * do_IRQ handles all normal device IRQ's (the special
219 * SMP cross-CPU interrupts have their own specific
220 * handlers).
221 */
222unsigned int do_IRQ(struct pt_regs *regs)
223{
224 struct pt_regs *old_regs;
225 /* high bit used in ret_from_ code */
226 int overflow, irq = ~regs->orig_ax;
227 struct irq_desc *desc = irq_desc + irq;
228
229 if (unlikely((unsigned)irq >= NR_IRQS)) {
230 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
231 __func__, irq);
232 BUG();
233 }
234
235 old_regs = set_irq_regs(regs);
236 irq_enter();
237
238 overflow = check_stack_overflow();
239
240 if (!execute_on_irq_stack(overflow, desc, irq)) {
241 if (unlikely(overflow))
242 print_stack_overflow();
243 desc->handle_irq(irq, desc);
244 }
245
246 irq_exit();
247 set_irq_regs(old_regs);
248 return 1;
249}
250
251/*
233 * Interrupt statistics: 252 * Interrupt statistics:
234 */ 253 */
235 254
@@ -313,16 +332,20 @@ skip:
313 per_cpu(irq_stat,j).irq_tlb_count); 332 per_cpu(irq_stat,j).irq_tlb_count);
314 seq_printf(p, " TLB shootdowns\n"); 333 seq_printf(p, " TLB shootdowns\n");
315#endif 334#endif
335#ifdef CONFIG_X86_MCE
316 seq_printf(p, "TRM: "); 336 seq_printf(p, "TRM: ");
317 for_each_online_cpu(j) 337 for_each_online_cpu(j)
318 seq_printf(p, "%10u ", 338 seq_printf(p, "%10u ",
319 per_cpu(irq_stat,j).irq_thermal_count); 339 per_cpu(irq_stat,j).irq_thermal_count);
320 seq_printf(p, " Thermal event interrupts\n"); 340 seq_printf(p, " Thermal event interrupts\n");
341#endif
342#ifdef CONFIG_X86_LOCAL_APIC
321 seq_printf(p, "SPU: "); 343 seq_printf(p, "SPU: ");
322 for_each_online_cpu(j) 344 for_each_online_cpu(j)
323 seq_printf(p, "%10u ", 345 seq_printf(p, "%10u ",
324 per_cpu(irq_stat,j).irq_spurious_count); 346 per_cpu(irq_stat,j).irq_spurious_count);
325 seq_printf(p, " Spurious interrupts\n"); 347 seq_printf(p, " Spurious interrupts\n");
348#endif
326 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 349 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
327#if defined(CONFIG_X86_IO_APIC) 350#if defined(CONFIG_X86_IO_APIC)
328 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 351 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +354,40 @@ skip:
331 return 0; 354 return 0;
332} 355}
333 356
357/*
358 * /proc/stat helpers
359 */
360u64 arch_irq_stat_cpu(unsigned int cpu)
361{
362 u64 sum = nmi_count(cpu);
363
364#ifdef CONFIG_X86_LOCAL_APIC
365 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
366#endif
367#ifdef CONFIG_SMP
368 sum += per_cpu(irq_stat, cpu).irq_resched_count;
369 sum += per_cpu(irq_stat, cpu).irq_call_count;
370 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
371#endif
372#ifdef CONFIG_X86_MCE
373 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
374#endif
375#ifdef CONFIG_X86_LOCAL_APIC
376 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
377#endif
378 return sum;
379}
380
381u64 arch_irq_stat(void)
382{
383 u64 sum = atomic_read(&irq_err_count);
384
385#ifdef CONFIG_X86_IO_APIC
386 sum += atomic_read(&irq_mis_count);
387#endif
388 return sum;
389}
390
334#ifdef CONFIG_HOTPLUG_CPU 391#ifdef CONFIG_HOTPLUG_CPU
335#include <mach_apic.h> 392#include <mach_apic.h>
336 393
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a91..1f78b238d8d2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -135,6 +135,7 @@ skip:
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n"); 136 seq_printf(p, " TLB shootdowns\n");
137#endif 137#endif
138#ifdef CONFIG_X86_MCE
138 seq_printf(p, "TRM: "); 139 seq_printf(p, "TRM: ");
139 for_each_online_cpu(j) 140 for_each_online_cpu(j)
140 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); 141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
143 for_each_online_cpu(j) 144 for_each_online_cpu(j)
144 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); 145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
145 seq_printf(p, " Threshold APIC interrupts\n"); 146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
146 seq_printf(p, "SPU: "); 148 seq_printf(p, "SPU: ");
147 for_each_online_cpu(j) 149 for_each_online_cpu(j)
148 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); 150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -153,6 +155,32 @@ skip:
153} 155}
154 156
155/* 157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
156 * do_IRQ handles all normal device IRQ's (the special 184 * do_IRQ handles all normal device IRQ's (the special
157 * SMP cross-CPU interrupts have their own specific 185 * SMP cross-CPU interrupts have their own specific
158 * handlers). 186 * handlers).
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..d66914287ee1
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
1#include <linux/errno.h>
2#include <linux/signal.h>
3#include <linux/sched.h>
4#include <linux/ioport.h>
5#include <linux/interrupt.h>
6#include <linux/slab.h>
7#include <linux/random.h>
8#include <linux/init.h>
9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h>
11#include <linux/bitops.h>
12
13#include <asm/atomic.h>
14#include <asm/system.h>
15#include <asm/io.h>
16#include <asm/timer.h>
17#include <asm/pgtable.h>
18#include <asm/delay.h>
19#include <asm/desc.h>
20#include <asm/apic.h>
21#include <asm/arch_hooks.h>
22#include <asm/i8259.h>
23
24
25
26/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly
29 * (ie as explained in the intel literature). On a 386, you
30 * can't use exception 16 due to bad IBM design, so we have to
31 * rely on the less exact irq13.
32 *
33 * Careful.. Not only is IRQ13 unreliable, but it is also
34 * leads to races. IBM designers who came up with it should
35 * be shot.
36 */
37
38
39static irqreturn_t math_error_irq(int cpl, void *dev_id)
40{
41 extern void math_error(void __user *);
42 outb(0,0xF0);
43 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
44 return IRQ_NONE;
45 math_error((void __user *)get_irq_regs()->ip);
46 return IRQ_HANDLED;
47}
48
49/*
50 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
51 * so allow interrupt sharing.
52 */
53static struct irqaction fpu_irq = {
54 .handler = math_error_irq,
55 .mask = CPU_MASK_NONE,
56 .name = "fpu",
57};
58
59void __init init_ISA_irqs (void)
60{
61 int i;
62
63#ifdef CONFIG_X86_LOCAL_APIC
64 init_bsp_APIC();
65#endif
66 init_8259A(0);
67
68 /*
69 * 16 old-style INTA-cycle interrupts:
70 */
71 for (i = 0; i < 16; i++) {
72 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT");
74 }
75}
76
77/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79
80void __init native_init_IRQ(void)
81{
82 int i;
83
84 /* all the set up before the call gates are initialised */
85 pre_intr_init_hook();
86
87 /*
88 * Cover the whole vector space, no vector can escape
89 * us. (some of these will be overridden and become
90 * 'special' SMP interrupts)
91 */
92 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
93 int vector = FIRST_EXTERNAL_VECTOR + i;
94 if (i >= NR_IRQS)
95 break;
96 /* SYSCALL_VECTOR was reserved in trap_init. */
97 if (!test_bit(vector, used_vectors))
98 set_intr_gate(vector, interrupt[i]);
99 }
100
101 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates)
103 */
104 intr_init_hook();
105
106 /*
107 * External FPU? Set up irq13 if so, for
108 * original braindamaged IBM FERR coupling.
109 */
110 if (boot_cpu_data.hard_math && !cpu_has_fpu)
111 setup_irq(FPU_IRQ, &fpu_irq);
112
113 irq_ctx_init(smp_processor_id());
114}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..1f26fd9ec4f4
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,222 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f)
64 */
65
66/*
67 * The IO-APIC gives us many more interrupt sources. Most of these
68 * are unused but an SMP system is supposed to have enough memory ...
69 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
70 * across the spectrum, so we really want to be prepared to get all
71 * of these. Plus, more powerful systems might have more than 64
72 * IO-APIC registers.
73 *
74 * (these are usually mapped into the 0x30-0xff vector range)
75 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107
108/*
109 * IRQ2 is cascade interrupt to second interrupt controller
110 */
111
112static struct irqaction irq2 = {
113 .handler = no_action,
114 .mask = CPU_MASK_NONE,
115 .name = "cascade",
116};
117DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
118 [0 ... IRQ0_VECTOR - 1] = -1,
119 [IRQ0_VECTOR] = 0,
120 [IRQ1_VECTOR] = 1,
121 [IRQ2_VECTOR] = 2,
122 [IRQ3_VECTOR] = 3,
123 [IRQ4_VECTOR] = 4,
124 [IRQ5_VECTOR] = 5,
125 [IRQ6_VECTOR] = 6,
126 [IRQ7_VECTOR] = 7,
127 [IRQ8_VECTOR] = 8,
128 [IRQ9_VECTOR] = 9,
129 [IRQ10_VECTOR] = 10,
130 [IRQ11_VECTOR] = 11,
131 [IRQ12_VECTOR] = 12,
132 [IRQ13_VECTOR] = 13,
133 [IRQ14_VECTOR] = 14,
134 [IRQ15_VECTOR] = 15,
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136};
137
138static void __init init_ISA_irqs (void)
139{
140 int i;
141
142 init_bsp_APIC();
143 init_8259A(0);
144
145 for (i = 0; i < NR_IRQS; i++) {
146 irq_desc[i].status = IRQ_DISABLED;
147 irq_desc[i].action = NULL;
148 irq_desc[i].depth = 1;
149
150 if (i < 16) {
151 /*
152 * 16 old-style INTA-cycle interrupts:
153 */
154 set_irq_chip_and_handler_name(i, &i8259A_chip,
155 handle_level_irq, "XT");
156 } else {
157 /*
158 * 'high' PCI IRQs filled in on demand
159 */
160 irq_desc[i].chip = &no_irq_chip;
161 }
162 }
163}
164
165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
166
167void __init native_init_IRQ(void)
168{
169 int i;
170
171 init_ISA_irqs();
172 /*
173 * Cover the whole vector space, no vector can escape
174 * us. (some of these will be overridden and become
175 * 'special' SMP interrupts)
176 */
177 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
178 int vector = FIRST_EXTERNAL_VECTOR + i;
179 if (vector != IA32_SYSCALL_VECTOR)
180 set_intr_gate(vector, interrupt[i]);
181 }
182
183#ifdef CONFIG_SMP
184 /*
185 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
186 * IPI, driven by wakeup.
187 */
188 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
189
190 /* IPIs for invalidation */
191 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
192 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
193 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
194 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
195 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
196 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
197 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
198 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
199
200 /* IPI for generic function call */
201 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
202
203 /* IPI for generic single function call */
204 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
205 call_function_single_interrupt);
206
207 /* Low priority IPI to cleanup after moving an irq */
208 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
209#endif
210 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
211 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
212
213 /* self generated IPI for local APIC timer */
214 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
215
216 /* IPI vectors for APIC spurious and error interrupts */
217 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
218 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
219
220 if (!acpi_ioapic)
221 setup_irq(2, &irq2);
222}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..f2d43bc75514 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
209{ 213{
210 int error = 0; 214 int error = 0;
211 215
216 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
217 if (!arch_debugfs_dir)
218 return -ENOMEM;
219
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 220#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 221 error = boot_params_kdebugfs_init();
214#endif 222#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..d02def06ca91 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c73..3fee2aa50f3f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,9 +20,9 @@
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21 21
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23static void flush_ldt(void *null) 23static void flush_ldt(void *current_mm)
24{ 24{
25 if (current->active_mm) 25 if (current->active_mm == current_mm)
26 load_LDT(&current->active_mm->context); 26 load_LDT(&current->active_mm->context);
27} 27}
28#endif 28#endif
@@ -62,13 +62,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 62
63 if (reload) { 63 if (reload) {
64#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
65 cpumask_t mask; 65 cpumask_of_cpu_ptr_declare(mask);
66 66
67 preempt_disable(); 67 preempt_disable();
68 load_LDT(pc); 68 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 69 cpumask_of_cpu_ptr_next(mask, smp_processor_id());
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 70 if (!cpus_equal(current->mm->cpu_vm_mask, *mask))
71 smp_call_function(flush_ldt, NULL, 1, 1); 71 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 72 preempt_enable();
73#else 73#else
74 load_LDT(pc); 74 load_LDT(pc);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,8 @@
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
16#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
@@ -20,6 +22,7 @@
20#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
21#include <asm/desc.h> 23#include <asm/desc.h>
22#include <asm/system.h> 24#include <asm/system.h>
25#include <asm/cacheflush.h>
23 26
24#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 27#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
25static u32 kexec_pgd[1024] PAGE_ALIGNED; 28static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -39,7 +42,7 @@ static void set_idt(void *newidt, __u16 limit)
39 curidt.address = (unsigned long)newidt; 42 curidt.address = (unsigned long)newidt;
40 43
41 load_idt(&curidt); 44 load_idt(&curidt);
42}; 45}
43 46
44 47
45static void set_gdt(void *newgdt, __u16 limit) 48static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +54,7 @@ static void set_gdt(void *newgdt, __u16 limit)
51 curgdt.address = (unsigned long)newgdt; 54 curgdt.address = (unsigned long)newgdt;
52 55
53 load_gdt(&curgdt); 56 load_gdt(&curgdt);
54}; 57}
55 58
56static void load_segments(void) 59static void load_segments(void)
57{ 60{
@@ -83,10 +86,12 @@ static void load_segments(void)
83 * reboot code buffer to allow us to avoid allocations 86 * reboot code buffer to allow us to avoid allocations
84 * later. 87 * later.
85 * 88 *
86 * Currently nothing. 89 * Make control page executable.
87 */ 90 */
88int machine_kexec_prepare(struct kimage *image) 91int machine_kexec_prepare(struct kimage *image)
89{ 92{
93 if (nx_enabled)
94 set_pages_x(image->control_code_page, 1);
90 return 0; 95 return 0;
91} 96}
92 97
@@ -96,25 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
96 */ 101 */
97void machine_kexec_cleanup(struct kimage *image) 102void machine_kexec_cleanup(struct kimage *image)
98{ 103{
104 if (nx_enabled)
105 set_pages_nx(image->control_code_page, 1);
99} 106}
100 107
101/* 108/*
102 * Do not allocate memory (or fail in any way) in machine_kexec(). 109 * Do not allocate memory (or fail in any way) in machine_kexec().
103 * We are past the point of no return, committed to rebooting now. 110 * We are past the point of no return, committed to rebooting now.
104 */ 111 */
105NORET_TYPE void machine_kexec(struct kimage *image) 112void machine_kexec(struct kimage *image)
106{ 113{
107 unsigned long page_list[PAGES_NR]; 114 unsigned long page_list[PAGES_NR];
108 void *control_page; 115 void *control_page;
116 asmlinkage unsigned long
117 (*relocate_kernel_ptr)(unsigned long indirection_page,
118 unsigned long control_page,
119 unsigned long start_address,
120 unsigned int has_pae,
121 unsigned int preserve_context);
122
123 tracer_disable();
109 124
110 /* Interrupts aren't acceptable while we reboot */ 125 /* Interrupts aren't acceptable while we reboot */
111 local_irq_disable(); 126 local_irq_disable();
112 127
128 if (image->preserve_context) {
129#ifdef CONFIG_X86_IO_APIC
130 /* We need to put APICs in legacy mode so that we can
131 * get timer interrupts in second kernel. kexec/kdump
132 * paths already have calls to disable_IO_APIC() in
133 * one form or other. kexec jump path also need
134 * one.
135 */
136 disable_IO_APIC();
137#endif
138 }
139
113 control_page = page_address(image->control_code_page); 140 control_page = page_address(image->control_code_page);
114 memcpy(control_page, relocate_kernel, PAGE_SIZE); 141 memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
115 142
143 relocate_kernel_ptr = control_page;
116 page_list[PA_CONTROL_PAGE] = __pa(control_page); 144 page_list[PA_CONTROL_PAGE] = __pa(control_page);
117 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 145 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
118 page_list[PA_PGD] = __pa(kexec_pgd); 146 page_list[PA_PGD] = __pa(kexec_pgd);
119 page_list[VA_PGD] = (unsigned long)kexec_pgd; 147 page_list[VA_PGD] = (unsigned long)kexec_pgd;
120#ifdef CONFIG_X86_PAE 148#ifdef CONFIG_X86_PAE
@@ -127,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
127 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 155 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
128 page_list[PA_PTE_1] = __pa(kexec_pte1); 156 page_list[PA_PTE_1] = __pa(kexec_pte1);
129 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 157 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
158 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
130 159
131 /* The segment registers are funny things, they have both a 160 /* The segment registers are funny things, they have both a
132 * visible and an invisible part. Whenever the visible part is 161 * visible and an invisible part. Whenever the visible part is
@@ -145,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
145 set_idt(phys_to_virt(0),0); 174 set_idt(phys_to_virt(0),0);
146 175
147 /* now call it */ 176 /* now call it */
148 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 177 image->start = relocate_kernel_ptr((unsigned long)image->head,
149 image->start, cpu_has_pae); 178 (unsigned long)page_list,
179 image->start, cpu_has_pae,
180 image->preserve_context);
150} 181}
151 182
152void arch_crash_save_vmcoreinfo(void) 183void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db4511..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h>
15
14#include <asm/pgtable.h> 16#include <asm/pgtable.h>
15#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
16#include <asm/mmu_context.h> 18#include <asm/mmu_context.h>
@@ -110,7 +112,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
110{ 112{
111 pgd_t *level4p; 113 pgd_t *level4p;
112 level4p = (pgd_t *)__va(start_pgtable); 114 level4p = (pgd_t *)__va(start_pgtable);
113 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); 115 return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
114} 116}
115 117
116static void set_idt(void *newidt, u16 limit) 118static void set_idt(void *newidt, u16 limit)
@@ -179,11 +181,13 @@ void machine_kexec_cleanup(struct kimage *image)
179 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
180 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
181 */ 183 */
182NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
183{ 185{
184 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
185 void *control_page; 187 void *control_page;
186 188
189 tracer_disable();
190
187 /* Interrupts aren't acceptable while we reboot */ 191 /* Interrupts aren't acceptable while we reboot */
188 local_irq_disable(); 192 local_irq_disable();
189 193
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..6994c751590e 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -5,13 +5,14 @@
5 * 2006 Shaohua Li <shaohua.li@intel.com> 5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 * 6 *
7 * This driver allows to upgrade microcode on Intel processors 7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II, 8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc. 9 * Pentium III, Xeon, Pentium 4, etc.
10 * 10 *
11 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
12 * Order Number 245472 or free download from: 12 * Software Developer's Manual
13 * 13 * Order Number 253668 or free download from:
14 * http://developer.intel.com/design/pentium4/manuals/245472.htm 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
15 * 16 *
16 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
17 * 18 *
@@ -58,12 +59,12 @@
58 * nature of implementation. 59 * nature of implementation.
59 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> 60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
60 * Fix the panic when writing zero-length microcode chunk. 61 * Fix the panic when writing zero-length microcode chunk.
61 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, 62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
62 * Jun Nakajima <jun.nakajima@intel.com> 63 * Jun Nakajima <jun.nakajima@intel.com>
63 * Support for the microcode updates in the new format. 64 * Support for the microcode updates in the new format.
64 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> 65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
65 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl 66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
66 * because we no longer hold a copy of applied microcode 67 * because we no longer hold a copy of applied microcode
67 * in kernel memory. 68 * in kernel memory.
68 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> 69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
69 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
@@ -75,6 +76,7 @@
75#include <linux/kernel.h> 76#include <linux/kernel.h>
76#include <linux/init.h> 77#include <linux/init.h>
77#include <linux/sched.h> 78#include <linux/sched.h>
79#include <linux/smp_lock.h>
78#include <linux/cpumask.h> 80#include <linux/cpumask.h>
79#include <linux/module.h> 81#include <linux/module.h>
80#include <linux/slab.h> 82#include <linux/slab.h>
@@ -320,11 +322,11 @@ static void apply_microcode(int cpu)
320 return; 322 return;
321 323
322 /* serialize access to the physical write to MSR 0x79 */ 324 /* serialize access to the physical write to MSR 0x79 */
323 spin_lock_irqsave(&microcode_update_lock, flags); 325 spin_lock_irqsave(&microcode_update_lock, flags);
324 326
325 /* write microcode via MSR 0x79 */ 327 /* write microcode via MSR 0x79 */
326 wrmsr(MSR_IA32_UCODE_WRITE, 328 wrmsr(MSR_IA32_UCODE_WRITE,
327 (unsigned long) uci->mc->bits, 329 (unsigned long) uci->mc->bits,
328 (unsigned long) uci->mc->bits >> 16 >> 16); 330 (unsigned long) uci->mc->bits >> 16 >> 16);
329 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
330 332
@@ -341,7 +343,7 @@ static void apply_microcode(int cpu)
341 return; 343 return;
342 } 344 }
343 printk(KERN_INFO "microcode: CPU%d updated from revision " 345 printk(KERN_INFO "microcode: CPU%d updated from revision "
344 "0x%x to 0x%x, date = %08x \n", 346 "0x%x to 0x%x, date = %08x \n",
345 cpu_num, uci->rev, val[1], uci->mc->hdr.date); 347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
346 uci->rev = val[1]; 348 uci->rev = val[1];
347} 349}
@@ -386,6 +388,7 @@ static int do_microcode_update (void)
386 void *new_mc = NULL; 388 void *new_mc = NULL;
387 int cpu; 389 int cpu;
388 cpumask_t old; 390 cpumask_t old;
391 cpumask_of_cpu_ptr_declare(newmask);
389 392
390 old = current->cpus_allowed; 393 old = current->cpus_allowed;
391 394
@@ -402,7 +405,8 @@ static int do_microcode_update (void)
402 405
403 if (!uci->valid) 406 if (!uci->valid)
404 continue; 407 continue;
405 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 408 cpumask_of_cpu_ptr_next(newmask, cpu);
409 set_cpus_allowed_ptr(current, newmask);
406 error = get_maching_microcode(new_mc, cpu); 410 error = get_maching_microcode(new_mc, cpu);
407 if (error < 0) 411 if (error < 0)
408 goto out; 412 goto out;
@@ -422,6 +426,7 @@ out:
422 426
423static int microcode_open (struct inode *unused1, struct file *unused2) 427static int microcode_open (struct inode *unused1, struct file *unused2)
424{ 428{
429 cycle_kernel_lock();
425 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 430 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
426} 431}
427 432
@@ -488,7 +493,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
488#define microcode_dev_exit() do { } while(0) 493#define microcode_dev_exit() do { } while(0)
489#endif 494#endif
490 495
491static long get_next_ucode_from_buffer(void **mc, void *buf, 496static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
492 unsigned long size, long offset) 497 unsigned long size, long offset)
493{ 498{
494 microcode_header_t *mc_header; 499 microcode_header_t *mc_header;
@@ -522,7 +527,7 @@ static int cpu_request_microcode(int cpu)
522 char name[30]; 527 char name[30];
523 struct cpuinfo_x86 *c = &cpu_data(cpu); 528 struct cpuinfo_x86 *c = &cpu_data(cpu);
524 const struct firmware *firmware; 529 const struct firmware *firmware;
525 void *buf; 530 const u8 *buf;
526 unsigned long size; 531 unsigned long size;
527 long offset = 0; 532 long offset = 0;
528 int error; 533 int error;
@@ -534,7 +539,7 @@ static int cpu_request_microcode(int cpu)
534 c->x86, c->x86_model, c->x86_mask); 539 c->x86, c->x86_model, c->x86_mask);
535 error = request_firmware(&firmware, name, &microcode_pdev->dev); 540 error = request_firmware(&firmware, name, &microcode_pdev->dev);
536 if (error) { 541 if (error) {
537 pr_debug("microcode: ucode data file %s load failed\n", name); 542 pr_debug("microcode: data file %s load failed\n", name);
538 return error; 543 return error;
539 } 544 }
540 buf = firmware->data; 545 buf = firmware->data;
@@ -571,6 +576,7 @@ static int apply_microcode_check_cpu(int cpu)
571 struct cpuinfo_x86 *c = &cpu_data(cpu); 576 struct cpuinfo_x86 *c = &cpu_data(cpu);
572 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 577 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
573 cpumask_t old; 578 cpumask_t old;
579 cpumask_of_cpu_ptr(newmask, cpu);
574 unsigned int val[2]; 580 unsigned int val[2];
575 int err = 0; 581 int err = 0;
576 582
@@ -579,7 +585,7 @@ static int apply_microcode_check_cpu(int cpu)
579 return 0; 585 return 0;
580 586
581 old = current->cpus_allowed; 587 old = current->cpus_allowed;
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 588 set_cpus_allowed_ptr(current, newmask);
583 589
584 /* Check if the microcode we have in memory matches the CPU */ 590 /* Check if the microcode we have in memory matches the CPU */
585 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 591 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -617,11 +623,12 @@ static int apply_microcode_check_cpu(int cpu)
617static void microcode_init_cpu(int cpu, int resume) 623static void microcode_init_cpu(int cpu, int resume)
618{ 624{
619 cpumask_t old; 625 cpumask_t old;
626 cpumask_of_cpu_ptr(newmask, cpu);
620 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 627 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
621 628
622 old = current->cpus_allowed; 629 old = current->cpus_allowed;
623 630
624 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 631 set_cpus_allowed_ptr(current, newmask);
625 mutex_lock(&microcode_mutex); 632 mutex_lock(&microcode_mutex);
626 collect_cpu_info(cpu); 633 collect_cpu_info(cpu);
627 if (uci->valid && system_state == SYSTEM_RUNNING && !resume) 634 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
@@ -641,7 +648,9 @@ static void microcode_fini_cpu(int cpu)
641 mutex_unlock(&microcode_mutex); 648 mutex_unlock(&microcode_mutex);
642} 649}
643 650
644static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) 651static ssize_t reload_store(struct sys_device *dev,
652 struct sysdev_attribute *attr,
653 const char *buf, size_t sz)
645{ 654{
646 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 655 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
647 char *end; 656 char *end;
@@ -653,11 +662,12 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
653 return -EINVAL; 662 return -EINVAL;
654 if (val == 1) { 663 if (val == 1) {
655 cpumask_t old; 664 cpumask_t old;
665 cpumask_of_cpu_ptr(newmask, cpu);
656 666
657 old = current->cpus_allowed; 667 old = current->cpus_allowed;
658 668
659 get_online_cpus(); 669 get_online_cpus();
660 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 670 set_cpus_allowed_ptr(current, newmask);
661 671
662 mutex_lock(&microcode_mutex); 672 mutex_lock(&microcode_mutex);
663 if (uci->valid) 673 if (uci->valid)
@@ -671,14 +681,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
671 return sz; 681 return sz;
672} 682}
673 683
674static ssize_t version_show(struct sys_device *dev, char *buf) 684static ssize_t version_show(struct sys_device *dev,
685 struct sysdev_attribute *attr, char *buf)
675{ 686{
676 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 687 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
677 688
678 return sprintf(buf, "0x%x\n", uci->rev); 689 return sprintf(buf, "0x%x\n", uci->rev);
679} 690}
680 691
681static ssize_t pf_show(struct sys_device *dev, char *buf) 692static ssize_t pf_show(struct sys_device *dev,
693 struct sysdev_attribute *attr, char *buf)
682{ 694{
683 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 695 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
684 696
@@ -805,6 +817,9 @@ static int __init microcode_init (void)
805{ 817{
806 int error; 818 int error;
807 819
820 printk(KERN_INFO
821 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
822
808 error = microcode_dev_init(); 823 error = microcode_dev_init();
809 if (error) 824 if (error)
810 return error; 825 return error;
@@ -825,9 +840,6 @@ static int __init microcode_init (void)
825 } 840 }
826 841
827 register_hotcpu_notifier(&mc_cpu_notifier); 842 register_hotcpu_notifier(&mc_cpu_notifier);
828
829 printk(KERN_INFO
830 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
831 return 0; 843 return 0;
832} 844}
833 845
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..fdfdc550b366 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
12#include <asm/io.h> 12#include <asm/io.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/acpi.h> 14#include <asm/acpi.h>
15#include <asm/mmconfig.h>
15 16
16#include "../pci/pci.h" 17#include "../pci/pci.h"
17 18
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e79..6ae005ccaed8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,9 @@
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/acpi.h> 26#include <asm/acpi.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h>
29#include <asm/trampoline.h>
30#include <asm/setup.h>
28 31
29#include <mach_apic.h> 32#include <mach_apic.h>
30#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -32,28 +35,6 @@
32#include <mach_mpparse.h> 35#include <mach_mpparse.h>
33#endif 36#endif
34 37
35/* Have we found an MP table */
36int smp_found_config;
37
38/*
39 * Various Linux-internal data structures created from the
40 * MP-table.
41 */
42#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
43int mp_bus_id_to_type[MAX_MP_BUSSES];
44#endif
45
46DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
47int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
48
49static int mp_current_pci_id;
50
51int pic_mode;
52
53/*
54 * Intel MP BIOS table parsing routines:
55 */
56
57/* 38/*
58 * Checksum an MP configuration block. 39 * Checksum an MP configuration block.
59 */ 40 */
@@ -68,18 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
68 return sum & 0xFF; 49 return sum & 0xFF;
69} 50}
70 51
71#ifdef CONFIG_X86_NUMAQ
72/*
73 * Have to match translation table entries to main table entries by counter
74 * hence the mpc_record variable .... can't see a less disgusting way of
75 * doing this ....
76 */
77
78static int mpc_record;
79static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
80 __cpuinitdata;
81#endif
82
83static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
84{ 53{
85 int apicid; 54 int apicid;
@@ -89,11 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
89 disabled_cpus++; 58 disabled_cpus++;
90 return; 59 return;
91 } 60 }
92#ifdef CONFIG_X86_NUMAQ 61
93 apicid = mpc_apic_id(m, translation_table[mpc_record]); 62 if (x86_quirks->mpc_apic_id)
94#else 63 apicid = x86_quirks->mpc_apic_id(m);
95 apicid = m->mpc_apicid; 64 else
96#endif 65 apicid = m->mpc_apicid;
66
97 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
98 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
99 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -103,18 +73,17 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
103 generic_processor_info(apicid, m->mpc_apicver); 73 generic_processor_info(apicid, m->mpc_apicver);
104} 74}
105 75
76#ifdef CONFIG_X86_IO_APIC
106static void __init MP_bus_info(struct mpc_config_bus *m) 77static void __init MP_bus_info(struct mpc_config_bus *m)
107{ 78{
108 char str[7]; 79 char str[7];
109
110 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
111 str[6] = 0; 81 str[6] = 0;
112 82
113#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
114 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 84 x86_quirks->mpc_oem_bus_info(m, str);
115#else 85 else
116 Dprintk("Bus #%d is %s\n", m->mpc_busid, str); 86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
117#endif
118 87
119#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
120 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -131,12 +100,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
131 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
132#endif 101#endif
133 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
134#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
135 mpc_oem_pci_bus(m, translation_table[mpc_record]); 104 x86_quirks->mpc_oem_pci_bus(m);
136#endif 105
137 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
138 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
139 mp_current_pci_id++;
140#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
141 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
142 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +114,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
147 } else 114 } else
148 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 115 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
149} 116}
117#endif
150 118
151#ifdef CONFIG_X86_IO_APIC 119#ifdef CONFIG_X86_IO_APIC
152 120
@@ -176,117 +144,111 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
176 if (bad_ioapic(m->mpc_apicaddr)) 144 if (bad_ioapic(m->mpc_apicaddr))
177 return; 145 return;
178 146
179 mp_ioapics[nr_ioapics] = *m; 147 mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
180 nr_ioapics++; 152 nr_ioapics++;
181} 153}
182 154
183static void __init MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
184{ 156{
185 mp_irqs[mp_irq_entries] = *m; 157 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
186 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
187 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
188 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
189 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
190 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); 161 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
191 if (++mp_irq_entries == MAX_IRQ_SOURCES)
192 panic("Max # of irq sources exceeded!!\n");
193} 162}
194 163
195#endif 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
196
197static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
198{ 165{
199 Dprintk("Lint: type %d, pol %d, trig %d, bus %d," 166 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
200 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
201 m->mpc_irqtype, m->mpc_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
202 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
203 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 170 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
204} 171}
205 172
206#ifdef CONFIG_X86_NUMAQ 173static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
207static void __init MP_translation_info(struct mpc_config_translation *m) 174 struct mp_config_intsrc *mp_irq)
208{ 175{
209 printk(KERN_INFO 176 mp_irq->mp_dstapic = m->mpc_dstapic;
210 "Translation: record %d, type %d, quad %d, global %d, local %d\n", 177 mp_irq->mp_type = m->mpc_type;
211 mpc_record, m->trans_type, m->trans_quad, m->trans_global, 178 mp_irq->mp_irqtype = m->mpc_irqtype;
212 m->trans_local); 179 mp_irq->mp_irqflag = m->mpc_irqflag;
180 mp_irq->mp_srcbus = m->mpc_srcbus;
181 mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
182 mp_irq->mp_dstirq = m->mpc_dstirq;
183}
213 184
214 if (mpc_record >= MAX_MPC_ENTRY) 185static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
215 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); 186 struct mpc_config_intsrc *m)
216 else 187{
217 translation_table[mpc_record] = m; /* stash this for later */ 188 m->mpc_dstapic = mp_irq->mp_dstapic;
218 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) 189 m->mpc_type = mp_irq->mp_type;
219 node_set_online(m->trans_quad); 190 m->mpc_irqtype = mp_irq->mp_irqtype;
191 m->mpc_irqflag = mp_irq->mp_irqflag;
192 m->mpc_srcbus = mp_irq->mp_srcbus;
193 m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
194 m->mpc_dstirq = mp_irq->mp_dstirq;
220} 195}
221 196
222/* 197static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
223 * Read/parse the MPC oem tables 198 struct mpc_config_intsrc *m)
224 */ 199{
200 if (mp_irq->mp_dstapic != m->mpc_dstapic)
201 return 1;
202 if (mp_irq->mp_type != m->mpc_type)
203 return 2;
204 if (mp_irq->mp_irqtype != m->mpc_irqtype)
205 return 3;
206 if (mp_irq->mp_irqflag != m->mpc_irqflag)
207 return 4;
208 if (mp_irq->mp_srcbus != m->mpc_srcbus)
209 return 5;
210 if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
211 return 6;
212 if (mp_irq->mp_dstirq != m->mpc_dstirq)
213 return 7;
214
215 return 0;
216}
225 217
226static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, 218static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
227 unsigned short oemsize)
228{ 219{
229 int count = sizeof(*oemtable); /* the header size */ 220 int i;
230 unsigned char *oemptr = ((unsigned char *)oemtable) + count; 221
231 222 print_MP_intsrc_info(m);
232 mpc_record = 0; 223
233 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", 224 for (i = 0; i < mp_irq_entries; i++) {
234 oemtable); 225 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
235 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { 226 return;
236 printk(KERN_WARNING
237 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
238 oemtable->oem_signature[0], oemtable->oem_signature[1],
239 oemtable->oem_signature[2], oemtable->oem_signature[3]);
240 return;
241 }
242 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
243 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
244 return;
245 }
246 while (count < oemtable->oem_length) {
247 switch (*oemptr) {
248 case MP_TRANSLATION:
249 {
250 struct mpc_config_translation *m =
251 (struct mpc_config_translation *)oemptr;
252 MP_translation_info(m);
253 oemptr += sizeof(*m);
254 count += sizeof(*m);
255 ++mpc_record;
256 break;
257 }
258 default:
259 {
260 printk(KERN_WARNING
261 "Unrecognised OEM table entry type! - %d\n",
262 (int)*oemptr);
263 return;
264 }
265 }
266 } 227 }
228
229 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
230 if (++mp_irq_entries == MAX_IRQ_SOURCES)
231 panic("Max # of irq sources exceeded!!\n");
267} 232}
268 233
269static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, 234#endif
270 char *productid) 235
236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
271{ 237{
272 if (strncmp(oem, "IBM NUMA", 8)) 238 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
273 printk("Warning! May not be a NUMA-Q system!\n"); 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
274 if (mpc->mpc_oemptr) 240 m->mpc_irqtype, m->mpc_irqflag & 3,
275 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
276 mpc->mpc_oemsize); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
277} 243}
278#endif /* CONFIG_X86_NUMAQ */
279 244
280/* 245/*
281 * Read/parse the MPC 246 * Read/parse the MPC
282 */ 247 */
283 248
284static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) 249static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
250 char *str)
285{ 251{
286 char str[16];
287 char oem[10];
288 int count = sizeof(*mpc);
289 unsigned char *mpt = ((unsigned char *)mpc) + count;
290 252
291 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { 253 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
292 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", 254 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +271,41 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
309 } 271 }
310 memcpy(oem, mpc->mpc_oem, 8); 272 memcpy(oem, mpc->mpc_oem, 8);
311 oem[8] = 0; 273 oem[8] = 0;
312 printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); 274 printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
313 275
314 memcpy(str, mpc->mpc_productid, 12); 276 memcpy(str, mpc->mpc_productid, 12);
315 str[12] = 0; 277 str[12] = 0;
316 printk("Product ID: %s ", str);
317 278
318#ifdef CONFIG_X86_32 279 printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
319 mps_oem_check(mpc, oem, str);
320#endif
321 printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
322 280
323 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); 281 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
324 282
283 return 1;
284}
285
286static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
287{
288 char str[16];
289 char oem[10];
290
291 int count = sizeof(*mpc);
292 unsigned char *mpt = ((unsigned char *)mpc) + count;
293
294 if (!smp_check_mpc(mpc, oem, str))
295 return 0;
296
297#ifdef CONFIG_X86_32
298 /*
299 * need to make sure summit and es7000's mps_oem_check is safe to be
300 * called early via genericarch 's mps_oem_check
301 */
302 if (early) {
303#ifdef CONFIG_X86_NUMAQ
304 numaq_mps_oem_check(mpc, oem, str);
305#endif
306 } else
307 mps_oem_check(mpc, oem, str);
308#endif
325 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
326 if (!acpi_lapic) 310 if (!acpi_lapic)
327 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -329,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
329 if (early) 313 if (early)
330 return 1; 314 return 1;
331 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
332 /* 321 /*
333 * Now process the configuration blocks. 322 * Now process the configuration blocks.
334 */ 323 */
335#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
336 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
337#endif 326
338 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
339 switch (*mpt) { 328 switch (*mpt) {
340 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -352,7 +341,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
352 { 341 {
353 struct mpc_config_bus *m = 342 struct mpc_config_bus *m =
354 (struct mpc_config_bus *)mpt; 343 (struct mpc_config_bus *)mpt;
344#ifdef CONFIG_X86_IO_APIC
355 MP_bus_info(m); 345 MP_bus_info(m);
346#endif
356 mpt += sizeof(*m); 347 mpt += sizeof(*m);
357 count += sizeof(*m); 348 count += sizeof(*m);
358 break; 349 break;
@@ -398,10 +389,14 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
398 count = mpc->mpc_length; 389 count = mpc->mpc_length;
399 break; 390 break;
400 } 391 }
401#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
402 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
403#endif
404 } 394 }
395
396#ifdef CONFIG_X86_GENERICARCH
397 generic_bigsmp_probe();
398#endif
399
405 setup_apic_routing(); 400 setup_apic_routing();
406 if (!num_processors) 401 if (!num_processors)
407 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 402 printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -427,7 +422,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
427 intsrc.mpc_type = MP_INTSRC; 422 intsrc.mpc_type = MP_INTSRC;
428 intsrc.mpc_irqflag = 0; /* conforming */ 423 intsrc.mpc_irqflag = 0; /* conforming */
429 intsrc.mpc_srcbus = 0; 424 intsrc.mpc_srcbus = 0;
430 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; 425 intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
431 426
432 intsrc.mpc_irqtype = mp_INT; 427 intsrc.mpc_irqtype = mp_INT;
433 428
@@ -488,40 +483,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
488 MP_intsrc_info(&intsrc); 483 MP_intsrc_info(&intsrc);
489} 484}
490 485
491#endif
492 486
493static inline void __init construct_default_ISA_mptable(int mpc_default_type) 487static void construct_ioapic_table(int mpc_default_type)
494{ 488{
495 struct mpc_config_processor processor;
496 struct mpc_config_bus bus;
497#ifdef CONFIG_X86_IO_APIC
498 struct mpc_config_ioapic ioapic; 489 struct mpc_config_ioapic ioapic;
499#endif 490 struct mpc_config_bus bus;
500 struct mpc_config_lintsrc lintsrc;
501 int linttypes[2] = { mp_ExtINT, mp_NMI };
502 int i;
503
504 /*
505 * local APIC has default address
506 */
507 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
508
509 /*
510 * 2 CPUs, numbered 0 & 1.
511 */
512 processor.mpc_type = MP_PROCESSOR;
513 /* Either an integrated APIC or a discrete 82489DX. */
514 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
515 processor.mpc_cpuflag = CPU_ENABLED;
516 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
517 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
518 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
519 processor.mpc_reserved[0] = 0;
520 processor.mpc_reserved[1] = 0;
521 for (i = 0; i < 2; i++) {
522 processor.mpc_apicid = i;
523 MP_processor_info(&processor);
524 }
525 491
526 bus.mpc_type = MP_BUS; 492 bus.mpc_type = MP_BUS;
527 bus.mpc_busid = 0; 493 bus.mpc_busid = 0;
@@ -550,7 +516,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
550 MP_bus_info(&bus); 516 MP_bus_info(&bus);
551 } 517 }
552 518
553#ifdef CONFIG_X86_IO_APIC
554 ioapic.mpc_type = MP_IOAPIC; 519 ioapic.mpc_type = MP_IOAPIC;
555 ioapic.mpc_apicid = 2; 520 ioapic.mpc_apicid = 2;
556 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 521 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +527,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
562 * We set up most of the low 16 IO-APIC pins according to MPS rules. 527 * We set up most of the low 16 IO-APIC pins according to MPS rules.
563 */ 528 */
564 construct_default_ioirq_mptable(mpc_default_type); 529 construct_default_ioirq_mptable(mpc_default_type);
530}
531#else
532static inline void construct_ioapic_table(int mpc_default_type) { }
565#endif 533#endif
534
535static inline void __init construct_default_ISA_mptable(int mpc_default_type)
536{
537 struct mpc_config_processor processor;
538 struct mpc_config_lintsrc lintsrc;
539 int linttypes[2] = { mp_ExtINT, mp_NMI };
540 int i;
541
542 /*
543 * local APIC has default address
544 */
545 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
546
547 /*
548 * 2 CPUs, numbered 0 & 1.
549 */
550 processor.mpc_type = MP_PROCESSOR;
551 /* Either an integrated APIC or a discrete 82489DX. */
552 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
553 processor.mpc_cpuflag = CPU_ENABLED;
554 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
555 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
556 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
557 processor.mpc_reserved[0] = 0;
558 processor.mpc_reserved[1] = 0;
559 for (i = 0; i < 2; i++) {
560 processor.mpc_apicid = i;
561 MP_processor_info(&processor);
562 }
563
564 construct_ioapic_table(mpc_default_type);
565
566 lintsrc.mpc_type = MP_LINTSRC; 566 lintsrc.mpc_type = MP_LINTSRC;
567 lintsrc.mpc_irqflag = 0; /* conforming */ 567 lintsrc.mpc_irqflag = 0; /* conforming */
568 lintsrc.mpc_srcbusid = 0; 568 lintsrc.mpc_srcbusid = 0;
@@ -580,10 +580,14 @@ static struct intel_mp_floating *mpf_found;
580/* 580/*
581 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
582 */ 582 */
583static void __init __get_smp_config(unsigned early) 583static void __init __get_smp_config(unsigned int early)
584{ 584{
585 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
586 586
587 if (x86_quirks->mach_get_smp_config) {
588 if (x86_quirks->mach_get_smp_config(early))
589 return;
590 }
587 if (acpi_lapic && early) 591 if (acpi_lapic && early)
588 return; 592 return;
589 /* 593 /*
@@ -600,7 +604,7 @@ static void __init __get_smp_config(unsigned early)
600 604
601 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 605 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
602 mpf->mpf_specification); 606 mpf->mpf_specification);
603#ifdef CONFIG_X86_32 607#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
604 if (mpf->mpf_feature2 & (1 << 7)) { 608 if (mpf->mpf_feature2 & (1 << 7)) {
605 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 609 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
606 pic_mode = 1; 610 pic_mode = 1;
@@ -632,7 +636,9 @@ static void __init __get_smp_config(unsigned early)
632 * override the defaults. 636 * override the defaults.
633 */ 637 */
634 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 638 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
639#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 640 smp_found_config = 0;
641#endif
636 printk(KERN_ERR 642 printk(KERN_ERR
637 "BIOS bug, MP table errors detected!...\n"); 643 "BIOS bug, MP table errors detected!...\n");
638 printk(KERN_ERR "... disabling SMP support. " 644 printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +695,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
689 unsigned int *bp = phys_to_virt(base); 695 unsigned int *bp = phys_to_virt(base);
690 struct intel_mp_floating *mpf; 696 struct intel_mp_floating *mpf;
691 697
692 Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); 698 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
693 BUILD_BUG_ON(sizeof(*mpf) != 16); 699 BUILD_BUG_ON(sizeof(*mpf) != 16);
694 700
695 while (length > 0) { 701 while (length > 0) {
@@ -699,15 +705,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
699 !mpf_checksum((unsigned char *)bp, 16) && 705 !mpf_checksum((unsigned char *)bp, 16) &&
700 ((mpf->mpf_specification == 1) 706 ((mpf->mpf_specification == 1)
701 || (mpf->mpf_specification == 4))) { 707 || (mpf->mpf_specification == 4))) {
702 708#ifdef CONFIG_X86_LOCAL_APIC
703 smp_found_config = 1; 709 smp_found_config = 1;
710#endif
704 mpf_found = mpf; 711 mpf_found = mpf;
705#ifdef CONFIG_X86_32 712
706 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 713 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
707 mpf, virt_to_phys(mpf)); 714 mpf, virt_to_phys(mpf));
708 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, 715
716 if (!reserve)
717 return 1;
718 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
709 BOOTMEM_DEFAULT); 719 BOOTMEM_DEFAULT);
710 if (mpf->mpf_physptr) { 720 if (mpf->mpf_physptr) {
721 unsigned long size = PAGE_SIZE;
722#ifdef CONFIG_X86_32
711 /* 723 /*
712 * We cannot access to MPC table to compute 724 * We cannot access to MPC table to compute
713 * table size yet, as only few megabytes from 725 * table size yet, as only few megabytes from
@@ -717,24 +729,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 * PAGE_SIZE from mpg->mpf_physptr yields BUG() 729 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
718 * in reserve_bootmem. 730 * in reserve_bootmem.
719 */ 731 */
720 unsigned long size = PAGE_SIZE;
721 unsigned long end = max_low_pfn * PAGE_SIZE; 732 unsigned long end = max_low_pfn * PAGE_SIZE;
722 if (mpf->mpf_physptr + size > end) 733 if (mpf->mpf_physptr + size > end)
723 size = end - mpf->mpf_physptr; 734 size = end - mpf->mpf_physptr;
724 reserve_bootmem(mpf->mpf_physptr, size, 735#endif
736 reserve_bootmem_generic(mpf->mpf_physptr, size,
725 BOOTMEM_DEFAULT); 737 BOOTMEM_DEFAULT);
726 } 738 }
727 739
728#else 740 return 1;
729 if (!reserve)
730 return 1;
731
732 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
733 if (mpf->mpf_physptr)
734 reserve_bootmem_generic(mpf->mpf_physptr,
735 PAGE_SIZE);
736#endif
737 return 1;
738 } 741 }
739 bp += 4; 742 bp += 4;
740 length -= 16; 743 length -= 16;
@@ -742,10 +745,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
742 return 0; 745 return 0;
743} 746}
744 747
745static void __init __find_smp_config(unsigned reserve) 748static void __init __find_smp_config(unsigned int reserve)
746{ 749{
747 unsigned int address; 750 unsigned int address;
748 751
752 if (x86_quirks->mach_find_smp_config) {
753 if (x86_quirks->mach_find_smp_config(reserve))
754 return;
755 }
749 /* 756 /*
750 * FIXME: Linux assumes you have 640K of base ram.. 757 * FIXME: Linux assumes you have 640K of base ram..
751 * this continues the error... 758 * this continues the error...
@@ -790,298 +797,294 @@ void __init find_smp_config(void)
790 __find_smp_config(1); 797 __find_smp_config(1);
791} 798}
792 799
793/* -------------------------------------------------------------------------- 800#ifdef CONFIG_X86_IO_APIC
794 ACPI-based MP Configuration 801static u8 __initdata irq_used[MAX_IRQ_SOURCES];
795 -------------------------------------------------------------------------- */
796 802
797/* 803static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
798 * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: 804{
799 */ 805 int i;
800int es7000_plat;
801 806
802#ifdef CONFIG_ACPI 807 if (m->mpc_irqtype != mp_INT)
808 return 0;
803 809
804#ifdef CONFIG_X86_IO_APIC 810 if (m->mpc_irqflag != 0x0f)
811 return 0;
805 812
806#define MP_ISA_BUS 0 813 /* not legacy */
807 814
808extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; 815 for (i = 0; i < mp_irq_entries; i++) {
816 if (mp_irqs[i].mp_irqtype != mp_INT)
817 continue;
809 818
810static int mp_find_ioapic(int gsi) 819 if (mp_irqs[i].mp_irqflag != 0x0f)
811{ 820 continue;
812 int i = 0;
813 821
814 /* Find the IOAPIC that manages this GSI. */ 822 if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
815 for (i = 0; i < nr_ioapics; i++) { 823 continue;
816 if ((gsi >= mp_ioapic_routing[i].gsi_base) 824 if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
817 && (gsi <= mp_ioapic_routing[i].gsi_end)) 825 continue;
818 return i; 826 if (irq_used[i]) {
827 /* already claimed */
828 return -2;
829 }
830 irq_used[i] = 1;
831 return i;
819 } 832 }
820 833
821 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); 834 /* not found */
822 return -1; 835 return -1;
823} 836}
824 837
825static u8 __init uniq_ioapic_id(u8 id) 838#define SPARE_SLOT_NUM 20
826{ 839
827#ifdef CONFIG_X86_32 840static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
828 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
829 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
830 return io_apic_get_unique_id(nr_ioapics, id);
831 else
832 return id;
833#else
834 int i;
835 DECLARE_BITMAP(used, 256);
836 bitmap_zero(used, 256);
837 for (i = 0; i < nr_ioapics; i++) {
838 struct mpc_config_ioapic *ia = &mp_ioapics[i];
839 __set_bit(ia->mpc_apicid, used);
840 }
841 if (!test_bit(id, used))
842 return id;
843 return find_first_zero_bit(used, 256);
844#endif 841#endif
845}
846 842
847void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) 843static int __init replace_intsrc_all(struct mp_config_table *mpc,
844 unsigned long mpc_new_phys,
845 unsigned long mpc_new_length)
848{ 846{
849 int idx = 0; 847#ifdef CONFIG_X86_IO_APIC
850 848 int i;
851 if (bad_ioapic(address)) 849 int nr_m_spare = 0;
852 return; 850#endif
853 851
854 idx = nr_ioapics; 852 int count = sizeof(*mpc);
853 unsigned char *mpt = ((unsigned char *)mpc) + count;
855 854
856 mp_ioapics[idx].mpc_type = MP_IOAPIC; 855 printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
857 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; 856 while (count < mpc->mpc_length) {
858 mp_ioapics[idx].mpc_apicaddr = address; 857 switch (*mpt) {
858 case MP_PROCESSOR:
859 {
860 struct mpc_config_processor *m =
861 (struct mpc_config_processor *)mpt;
862 mpt += sizeof(*m);
863 count += sizeof(*m);
864 break;
865 }
866 case MP_BUS:
867 {
868 struct mpc_config_bus *m =
869 (struct mpc_config_bus *)mpt;
870 mpt += sizeof(*m);
871 count += sizeof(*m);
872 break;
873 }
874 case MP_IOAPIC:
875 {
876 mpt += sizeof(struct mpc_config_ioapic);
877 count += sizeof(struct mpc_config_ioapic);
878 break;
879 }
880 case MP_INTSRC:
881 {
882#ifdef CONFIG_X86_IO_APIC
883 struct mpc_config_intsrc *m =
884 (struct mpc_config_intsrc *)mpt;
859 885
860 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 886 printk(KERN_INFO "OLD ");
861 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); 887 print_MP_intsrc_info(m);
862#ifdef CONFIG_X86_32 888 i = get_MP_intsrc_index(m);
863 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 889 if (i > 0) {
864#else 890 assign_to_mpc_intsrc(&mp_irqs[i], m);
865 mp_ioapics[idx].mpc_apicver = 0; 891 printk(KERN_INFO "NEW ");
892 print_mp_irq_info(&mp_irqs[i]);
893 } else if (!i) {
894 /* legacy, do nothing */
895 } else if (nr_m_spare < SPARE_SLOT_NUM) {
896 /*
897 * not found (-1), or duplicated (-2)
898 * are invalid entries,
899 * we need to use the slot later
900 */
901 m_spare[nr_m_spare] = m;
902 nr_m_spare++;
903 }
866#endif 904#endif
867 /* 905 mpt += sizeof(struct mpc_config_intsrc);
868 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 906 count += sizeof(struct mpc_config_intsrc);
869 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 907 break;
870 */ 908 }
871 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 909 case MP_LINTSRC:
872 mp_ioapic_routing[idx].gsi_base = gsi_base; 910 {
873 mp_ioapic_routing[idx].gsi_end = gsi_base + 911 struct mpc_config_lintsrc *m =
874 io_apic_get_redir_entries(idx); 912 (struct mpc_config_lintsrc *)mpt;
875 913 mpt += sizeof(*m);
876 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 914 count += sizeof(*m);
877 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 915 break;
878 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 916 }
879 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 917 default:
880 918 /* wrong mptable */
881 nr_ioapics++; 919 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
882} 920 printk(KERN_ERR "type %x\n", *mpt);
921 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
922 1, mpc, mpc->mpc_length, 1);
923 goto out;
924 }
925 }
883 926
884void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 927#ifdef CONFIG_X86_IO_APIC
885{ 928 for (i = 0; i < mp_irq_entries; i++) {
886 struct mpc_config_intsrc intsrc; 929 if (irq_used[i])
887 int ioapic = -1; 930 continue;
888 int pin = -1;
889 931
890 /* 932 if (mp_irqs[i].mp_irqtype != mp_INT)
891 * Convert 'gsi' to 'ioapic.pin'. 933 continue;
892 */
893 ioapic = mp_find_ioapic(gsi);
894 if (ioapic < 0)
895 return;
896 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
897 934
898 /* 935 if (mp_irqs[i].mp_irqflag != 0x0f)
899 * TBD: This check is for faulty timer entries, where the override 936 continue;
900 * erroneously sets the trigger to level, resulting in a HUGE
901 * increase of timer interrupts!
902 */
903 if ((bus_irq == 0) && (trigger == 3))
904 trigger = 1;
905 937
906 intsrc.mpc_type = MP_INTSRC; 938 if (nr_m_spare > 0) {
907 intsrc.mpc_irqtype = mp_INT; 939 printk(KERN_INFO "*NEW* found ");
908 intsrc.mpc_irqflag = (trigger << 2) | polarity; 940 nr_m_spare--;
909 intsrc.mpc_srcbus = MP_ISA_BUS; 941 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
910 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ 942 m_spare[nr_m_spare] = NULL;
911 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ 943 } else {
912 intsrc.mpc_dstirq = pin; /* INTIN# */ 944 struct mpc_config_intsrc *m =
945 (struct mpc_config_intsrc *)mpt;
946 count += sizeof(struct mpc_config_intsrc);
947 if (!mpc_new_phys) {
948 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
949 } else {
950 if (count <= mpc_new_length)
951 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
952 else {
953 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
954 goto out;
955 }
956 }
957 assign_to_mpc_intsrc(&mp_irqs[i], m);
958 mpc->mpc_length = count;
959 mpt += sizeof(struct mpc_config_intsrc);
960 }
961 print_mp_irq_info(&mp_irqs[i]);
962 }
963#endif
964out:
965 /* update checksum */
966 mpc->mpc_checksum = 0;
967 mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
968 mpc->mpc_length);
913 969
914 MP_intsrc_info(&intsrc); 970 return 0;
915} 971}
916 972
917void __init mp_config_acpi_legacy_irqs(void) 973static int __initdata enable_update_mptable;
918{
919 struct mpc_config_intsrc intsrc;
920 int i = 0;
921 int ioapic = -1;
922 974
923#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 975static int __init update_mptable_setup(char *str)
924 /* 976{
925 * Fabricate the legacy ISA bus (bus #31). 977 enable_update_mptable = 1;
926 */ 978 return 0;
927 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 979}
928#endif 980early_param("update_mptable", update_mptable_setup);
929 set_bit(MP_ISA_BUS, mp_bus_not_pci);
930 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
931 981
932 /* 982static unsigned long __initdata mpc_new_phys;
933 * Older generations of ES7000 have no legacy identity mappings 983static unsigned long mpc_new_length __initdata = 4096;
934 */
935 if (es7000_plat == 1)
936 return;
937 984
938 /* 985/* alloc_mptable or alloc_mptable=4k */
939 * Locate the IOAPIC that manages the ISA IRQs (0-15). 986static int __initdata alloc_mptable;
940 */ 987static int __init parse_alloc_mptable_opt(char *p)
941 ioapic = mp_find_ioapic(0); 988{
942 if (ioapic < 0) 989 enable_update_mptable = 1;
943 return; 990 alloc_mptable = 1;
991 if (!p)
992 return 0;
993 mpc_new_length = memparse(p, &p);
994 return 0;
995}
996early_param("alloc_mptable", parse_alloc_mptable_opt);
944 997
945 intsrc.mpc_type = MP_INTSRC; 998void __init early_reserve_e820_mpc_new(void)
946 intsrc.mpc_irqflag = 0; /* Conforming */ 999{
947 intsrc.mpc_srcbus = MP_ISA_BUS; 1000 if (enable_update_mptable && alloc_mptable) {
948#ifdef CONFIG_X86_IO_APIC 1001 u64 startt = 0;
949 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; 1002#ifdef CONFIG_X86_TRAMPOLINE
1003 startt = TRAMPOLINE_BASE;
950#endif 1004#endif
951 /* 1005 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
952 * Use the default configuration for the IRQs 0-15. Unless
953 * overridden by (MADT) interrupt source override entries.
954 */
955 for (i = 0; i < 16; i++) {
956 int idx;
957
958 for (idx = 0; idx < mp_irq_entries; idx++) {
959 struct mpc_config_intsrc *irq = mp_irqs + idx;
960
961 /* Do we already have a mapping for this ISA IRQ? */
962 if (irq->mpc_srcbus == MP_ISA_BUS
963 && irq->mpc_srcbusirq == i)
964 break;
965
966 /* Do we already have a mapping for this IOAPIC pin */
967 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
968 (irq->mpc_dstirq == i))
969 break;
970 }
971
972 if (idx != mp_irq_entries) {
973 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
974 continue; /* IRQ already used */
975 }
976
977 intsrc.mpc_irqtype = mp_INT;
978 intsrc.mpc_srcbusirq = i; /* Identity mapped */
979 intsrc.mpc_dstirq = i;
980
981 MP_intsrc_info(&intsrc);
982 } 1006 }
983} 1007}
984 1008
985int mp_register_gsi(u32 gsi, int triggering, int polarity) 1009static int __init update_mp_table(void)
986{ 1010{
987 int ioapic; 1011 char str[16];
988 int ioapic_pin; 1012 char oem[10];
989#ifdef CONFIG_X86_32 1013 struct intel_mp_floating *mpf;
990#define MAX_GSI_NUM 4096 1014 struct mp_config_table *mpc;
991#define IRQ_COMPRESSION_START 64 1015 struct mp_config_table *mpc_new;
1016
1017 if (!enable_update_mptable)
1018 return 0;
1019
1020 mpf = mpf_found;
1021 if (!mpf)
1022 return 0;
992 1023
993 static int pci_irq = IRQ_COMPRESSION_START;
994 /* 1024 /*
995 * Mapping between Global System Interrupts, which 1025 * Now see if we need to go further.
996 * represent all possible interrupts, and IRQs
997 * assigned to actual devices.
998 */ 1026 */
999 static int gsi_to_irq[MAX_GSI_NUM]; 1027 if (mpf->mpf_feature1 != 0)
1000#else 1028 return 0;
1001 1029
1002 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1030 if (!mpf->mpf_physptr)
1003 return gsi; 1031 return 0;
1004#endif
1005 1032
1006 /* Don't set up the ACPI SCI because it's already set up */ 1033 mpc = phys_to_virt(mpf->mpf_physptr);
1007 if (acpi_gbl_FADT.sci_interrupt == gsi)
1008 return gsi;
1009 1034
1010 ioapic = mp_find_ioapic(gsi); 1035 if (!smp_check_mpc(mpc, oem, str))
1011 if (ioapic < 0) { 1036 return 0;
1012 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1013 return gsi;
1014 }
1015 1037
1016 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1038 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
1039 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
1017 1040
1018#ifdef CONFIG_X86_32 1041 if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
1019 if (ioapic_renumber_irq) 1042 mpc_new_phys = 0;
1020 gsi = ioapic_renumber_irq(ioapic, gsi); 1043 printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
1021#endif 1044 mpc_new_length);
1022
1023 /*
1024 * Avoid pin reprogramming. PRTs typically include entries
1025 * with redundant pin->gsi mappings (but unique PCI devices);
1026 * we only program the IOAPIC on the first.
1027 */
1028 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1029 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1030 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1031 ioapic_pin);
1032 return gsi;
1033 } 1045 }
1034 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1046
1035 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1047 if (!mpc_new_phys) {
1036 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1048 unsigned char old, new;
1037#ifdef CONFIG_X86_32 1049 /* check if we can change the postion */
1038 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1050 mpc->mpc_checksum = 0;
1039#else 1051 old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1040 return gsi; 1052 mpc->mpc_checksum = 0xff;
1041#endif 1053 new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
1054 if (old == new) {
1055 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
1056 return 0;
1057 }
1058 printk(KERN_INFO "use in-positon replacing\n");
1059 } else {
1060 mpf->mpf_physptr = mpc_new_phys;
1061 mpc_new = phys_to_virt(mpc_new_phys);
1062 memcpy(mpc_new, mpc, mpc->mpc_length);
1063 mpc = mpc_new;
1064 /* check if we can modify that */
1065 if (mpc_new_phys - mpf->mpf_physptr) {
1066 struct intel_mp_floating *mpf_new;
1067 /* steal 16 bytes from [0, 1k) */
1068 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1069 mpf_new = phys_to_virt(0x400 - 16);
1070 memcpy(mpf_new, mpf, 16);
1071 mpf = mpf_new;
1072 mpf->mpf_physptr = mpc_new_phys;
1073 }
1074 mpf->mpf_checksum = 0;
1075 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
1076 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
1042 } 1077 }
1043 1078
1044 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1045#ifdef CONFIG_X86_32
1046 /* 1079 /*
1047 * For GSI >= 64, use IRQ compression 1080 * only replace the one with mp_INT and
1081 * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
1082 * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
1083 * may need pci=routeirq for all coverage
1048 */ 1084 */
1049 if ((gsi >= IRQ_COMPRESSION_START) 1085 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
1050 && (triggering == ACPI_LEVEL_SENSITIVE)) { 1086
1051 /* 1087 return 0;
1052 * For PCI devices assign IRQs in order, avoiding gaps
1053 * due to unused I/O APIC pins.
1054 */
1055 int irq = gsi;
1056 if (gsi < MAX_GSI_NUM) {
1057 /*
1058 * Retain the VIA chipset work-around (gsi > 15), but
1059 * avoid a problem where the 8254 timer (IRQ0) is setup
1060 * via an override (so it's not on pin 0 of the ioapic),
1061 * and at the same time, the pin 0 interrupt is a PCI
1062 * type. The gsi > 15 test could cause these two pins
1063 * to be shared as IRQ0, and they are not shareable.
1064 * So test for this condition, and if necessary, avoid
1065 * the pin collision.
1066 */
1067 gsi = pci_irq++;
1068 /*
1069 * Don't assign IRQ used by ACPI SCI
1070 */
1071 if (gsi == acpi_gbl_FADT.sci_interrupt)
1072 gsi = pci_irq++;
1073 gsi_to_irq[irq] = gsi;
1074 } else {
1075 printk(KERN_ERR "GSI %u is too high\n", gsi);
1076 return gsi;
1077 }
1078 }
1079#endif
1080 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1081 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1082 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1083 return gsi;
1084} 1088}
1085 1089
1086#endif /* CONFIG_X86_IO_APIC */ 1090late_initcall(update_mp_table);
1087#endif /* CONFIG_ACPI */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1f3abe048e93..9fd809552447 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file)
117{ 117{
118 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 118 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
119 struct cpuinfo_x86 *c = &cpu_data(cpu); 119 struct cpuinfo_x86 *c = &cpu_data(cpu);
120 int ret = 0;
120 121
121 if (cpu >= NR_CPUS || !cpu_online(cpu)) 122 lock_kernel();
122 return -ENXIO; /* No such CPU */ 123 cpu = iminor(file->f_path.dentry->d_inode);
123 if (!cpu_has(c, X86_FEATURE_MSR))
124 return -EIO; /* MSR not supported */
125 124
125 if (cpu >= NR_CPUS || !cpu_online(cpu)) {
126 ret = -ENXIO; /* No such CPU */
127 goto out;
128 }
129 c = &cpu_data(cpu);
130 if (!cpu_has(c, X86_FEATURE_MSR))
131 ret = -EIO; /* MSR not supported */
132out:
133 unlock_kernel();
126 return 0; 134 return 0;
127} 135}
128 136
@@ -141,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu)
141{ 149{
142 struct device *dev; 150 struct device *dev;
143 151
144 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 152 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
145 "msr%d", cpu); 153 NULL, "msr%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 154 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 155}
148 156
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi.c
index 5a29ded994fa..ac6d51222e7d 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi.c
@@ -6,10 +6,13 @@
6 * Fixes: 6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. 7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog. 8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
9 * Pavel Machek and 10 * Pavel Machek and
10 * Mikael Pettersson : PM converted to driver model. Disable/enable API. 11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
11 */ 12 */
12 13
14#include <asm/apic.h>
15
13#include <linux/nmi.h> 16#include <linux/nmi.h>
14#include <linux/mm.h> 17#include <linux/mm.h>
15#include <linux/delay.h> 18#include <linux/delay.h>
@@ -17,20 +20,26 @@
17#include <linux/module.h> 20#include <linux/module.h>
18#include <linux/sysdev.h> 21#include <linux/sysdev.h>
19#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/percpu.h>
20#include <linux/kprobes.h> 24#include <linux/kprobes.h>
21#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/kernel_stat.h>
22#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/smp.h>
23 29
30#include <asm/i8259.h>
31#include <asm/io_apic.h>
24#include <asm/smp.h> 32#include <asm/smp.h>
25#include <asm/nmi.h> 33#include <asm/nmi.h>
26#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/timer.h>
36
27#include <asm/mce.h> 37#include <asm/mce.h>
28 38
29#include <mach_traps.h> 39#include <mach_traps.h>
30 40
31int unknown_nmi_panic; 41int unknown_nmi_panic;
32int nmi_watchdog_enabled; 42int nmi_watchdog_enabled;
33int panic_on_unrecovered_nmi;
34 43
35static cpumask_t backtrace_mask = CPU_MASK_NONE; 44static cpumask_t backtrace_mask = CPU_MASK_NONE;
36 45
@@ -41,37 +50,65 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
41 * 0: the lapic NMI watchdog is disabled, but can be enabled 50 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */ 51 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
44static int panic_on_timeout; 58static int panic_on_timeout;
45 59
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ; 60static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled); 61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
50 63
51/* Run after command line and cpu_init init, but before all other checks */ 64static inline unsigned int get_nmi_count(int cpu)
52void nmi_watchdog_default(void)
53{ 65{
54 if (nmi_watchdog != NMI_DEFAULT) 66#ifdef CONFIG_X86_64
55 return; 67 return cpu_pda(cpu)->__nmi_count;
56 nmi_watchdog = NMI_NONE; 68#else
69 return nmi_count(cpu);
70#endif
71}
72
73static inline int mce_in_progress(void)
74{
75#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
76 return atomic_read(&mce_entry) > 0;
77#endif
78 return 0;
57} 79}
58 80
59static int endflag __initdata = 0; 81/*
82 * Take the local apic timer and PIT/HPET into account. We don't
83 * know which one is active, when we have highres/dyntick on
84 */
85static inline unsigned int get_timer_irqs(int cpu)
86{
87#ifdef CONFIG_X86_64
88 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
89#else
90 return per_cpu(irq_stat, cpu).apic_timer_irqs +
91 per_cpu(irq_stat, cpu).irq0_irqs;
92#endif
93}
60 94
61#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
62/* The performance counters used by NMI_LOCAL_APIC don't trigger when 96/*
97 * The performance counters used by NMI_LOCAL_APIC don't trigger when
63 * the CPU is idle. To make sure the NMI watchdog really ticks on all 98 * the CPU is idle. To make sure the NMI watchdog really ticks on all
64 * CPUs during the test make them busy. 99 * CPUs during the test make them busy.
65 */ 100 */
66static __init void nmi_cpu_busy(void *data) 101static __init void nmi_cpu_busy(void *data)
67{ 102{
68 local_irq_enable_in_hardirq(); 103 local_irq_enable_in_hardirq();
69 /* Intentionally don't use cpu_relax here. This is 104 /*
70 to make sure that the performance counter really ticks, 105 * Intentionally don't use cpu_relax here. This is
71 even if there is a simulator or similar that catches the 106 * to make sure that the performance counter really ticks,
72 pause instruction. On a real HT machine this is fine because 107 * even if there is a simulator or similar that catches the
73 all other CPUs are busy with "useless" delay loops and don't 108 * pause instruction. On a real HT machine this is fine because
74 care if they get somewhat less cycles. */ 109 * all other CPUs are busy with "useless" delay loops and don't
110 * care if they get somewhat less cycles.
111 */
75 while (endflag == 0) 112 while (endflag == 0)
76 mb(); 113 mb();
77} 114}
@@ -79,40 +116,37 @@ static __init void nmi_cpu_busy(void *data)
79 116
80int __init check_nmi_watchdog(void) 117int __init check_nmi_watchdog(void)
81{ 118{
82 int *prev_nmi_count; 119 unsigned int *prev_nmi_count;
83 int cpu; 120 int cpu;
84 121
85 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 122 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
86 return 0;
87
88 if (!atomic_read(&nmi_active))
89 return 0; 123 return 0;
90 124
91 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 125 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
92 if (!prev_nmi_count) 126 if (!prev_nmi_count)
93 return -1; 127 goto error;
94 128
95 printk(KERN_INFO "Testing NMI watchdog ... "); 129 printk(KERN_INFO "Testing NMI watchdog ... ");
96 130
97#ifdef CONFIG_SMP 131#ifdef CONFIG_SMP
98 if (nmi_watchdog == NMI_LOCAL_APIC) 132 if (nmi_watchdog == NMI_LOCAL_APIC)
99 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 133 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
100#endif 134#endif
101 135
102 for (cpu = 0; cpu < NR_CPUS; cpu++) 136 for_each_possible_cpu(cpu)
103 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; 137 prev_nmi_count[cpu] = get_nmi_count(cpu);
104 local_irq_enable(); 138 local_irq_enable();
105 mdelay((20*1000)/nmi_hz); // wait 20 ticks 139 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
106 140
107 for_each_online_cpu(cpu) { 141 for_each_online_cpu(cpu) {
108 if (!per_cpu(wd_enabled, cpu)) 142 if (!per_cpu(wd_enabled, cpu))
109 continue; 143 continue;
110 if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { 144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
111 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 145 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
112 "appears to be stuck (%d->%d)!\n", 146 "appears to be stuck (%d->%d)!\n",
113 cpu, 147 cpu,
114 prev_nmi_count[cpu], 148 prev_nmi_count[cpu],
115 cpu_pda(cpu)->__nmi_count); 149 get_nmi_count(cpu));
116 per_cpu(wd_enabled, cpu) = 0; 150 per_cpu(wd_enabled, cpu) = 0;
117 atomic_dec(&nmi_active); 151 atomic_dec(&nmi_active);
118 } 152 }
@@ -121,24 +155,33 @@ int __init check_nmi_watchdog(void)
121 if (!atomic_read(&nmi_active)) { 155 if (!atomic_read(&nmi_active)) {
122 kfree(prev_nmi_count); 156 kfree(prev_nmi_count);
123 atomic_set(&nmi_active, -1); 157 atomic_set(&nmi_active, -1);
124 return -1; 158 goto error;
125 } 159 }
126 printk("OK.\n"); 160 printk("OK.\n");
127 161
128 /* now that we know it works we can reduce NMI frequency to 162 /*
129 something more reasonable; makes a difference in some configs */ 163 * now that we know it works we can reduce NMI frequency to
164 * something more reasonable; makes a difference in some configs
165 */
130 if (nmi_watchdog == NMI_LOCAL_APIC) 166 if (nmi_watchdog == NMI_LOCAL_APIC)
131 nmi_hz = lapic_adjust_nmi_hz(1); 167 nmi_hz = lapic_adjust_nmi_hz(1);
132 168
133 kfree(prev_nmi_count); 169 kfree(prev_nmi_count);
134 return 0; 170 return 0;
171error:
172 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
173 disable_8259A_irq(0);
174#ifdef CONFIG_X86_32
175 timer_ack = 0;
176#endif
177 return -1;
135} 178}
136 179
137static int __init setup_nmi_watchdog(char *str) 180static int __init setup_nmi_watchdog(char *str)
138{ 181{
139 int nmi; 182 unsigned int nmi;
140 183
141 if (!strncmp(str,"panic",5)) { 184 if (!strncmp(str, "panic", 5)) {
142 panic_on_timeout = 1; 185 panic_on_timeout = 1;
143 str = strchr(str, ','); 186 str = strchr(str, ',');
144 if (!str) 187 if (!str)
@@ -148,15 +191,17 @@ static int __init setup_nmi_watchdog(char *str)
148 191
149 get_option(&str, &nmi); 192 get_option(&str, &nmi);
150 193
151 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) 194 if (nmi >= NMI_INVALID)
152 return 0; 195 return 0;
153 196
154 nmi_watchdog = nmi; 197 nmi_watchdog = nmi;
155 return 1; 198 return 1;
156} 199}
157
158__setup("nmi_watchdog=", setup_nmi_watchdog); 200__setup("nmi_watchdog=", setup_nmi_watchdog);
159 201
202/*
203 * Suspend/resume support
204 */
160#ifdef CONFIG_PM 205#ifdef CONFIG_PM
161 206
162static int nmi_pm_active; /* nmi_active before suspend */ 207static int nmi_pm_active; /* nmi_active before suspend */
@@ -195,7 +240,8 @@ static int __init init_lapic_nmi_sysfs(void)
195{ 240{
196 int error; 241 int error;
197 242
198 /* should really be a BUG_ON but b/c this is an 243 /*
244 * should really be a BUG_ON but b/c this is an
199 * init call, it just doesn't work. -dcz 245 * init call, it just doesn't work. -dcz
200 */ 246 */
201 if (nmi_watchdog != NMI_LOCAL_APIC) 247 if (nmi_watchdog != NMI_LOCAL_APIC)
@@ -209,6 +255,7 @@ static int __init init_lapic_nmi_sysfs(void)
209 error = sysdev_register(&device_lapic_nmi); 255 error = sysdev_register(&device_lapic_nmi);
210 return error; 256 return error;
211} 257}
258
212/* must come after the local APIC's device_initcall() */ 259/* must come after the local APIC's device_initcall() */
213late_initcall(init_lapic_nmi_sysfs); 260late_initcall(init_lapic_nmi_sysfs);
214 261
@@ -225,7 +272,7 @@ static void __acpi_nmi_enable(void *__unused)
225void acpi_nmi_enable(void) 272void acpi_nmi_enable(void)
226{ 273{
227 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 274 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
228 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); 275 on_each_cpu(__acpi_nmi_enable, NULL, 1);
229} 276}
230 277
231static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
@@ -239,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused)
239void acpi_nmi_disable(void) 286void acpi_nmi_disable(void)
240{ 287{
241 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) 288 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
242 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 289 on_each_cpu(__acpi_nmi_disable, NULL, 1);
243} 290}
244 291
245void setup_apic_nmi_watchdog(void *unused) 292void setup_apic_nmi_watchdog(void *unused)
@@ -249,11 +296,12 @@ void setup_apic_nmi_watchdog(void *unused)
249 296
250 /* cheap hack to support suspend/resume */ 297 /* cheap hack to support suspend/resume */
251 /* if cpu0 is not active neither should the other cpus */ 298 /* if cpu0 is not active neither should the other cpus */
252 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) 299 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
253 return; 300 return;
254 301
255 switch (nmi_watchdog) { 302 switch (nmi_watchdog) {
256 case NMI_LOCAL_APIC: 303 case NMI_LOCAL_APIC:
304 /* enable it before to avoid race with handler */
257 __get_cpu_var(wd_enabled) = 1; 305 __get_cpu_var(wd_enabled) = 1;
258 if (lapic_watchdog_init(nmi_hz) < 0) { 306 if (lapic_watchdog_init(nmi_hz) < 0) {
259 __get_cpu_var(wd_enabled) = 0; 307 __get_cpu_var(wd_enabled) = 0;
@@ -269,9 +317,8 @@ void setup_apic_nmi_watchdog(void *unused)
269void stop_apic_nmi_watchdog(void *unused) 317void stop_apic_nmi_watchdog(void *unused)
270{ 318{
271 /* only support LOCAL and IO APICs for now */ 319 /* only support LOCAL and IO APICs for now */
272 if ((nmi_watchdog != NMI_LOCAL_APIC) && 320 if (!nmi_watchdog_active())
273 (nmi_watchdog != NMI_IO_APIC)) 321 return;
274 return;
275 if (__get_cpu_var(wd_enabled) == 0) 322 if (__get_cpu_var(wd_enabled) == 0)
276 return; 323 return;
277 if (nmi_watchdog == NMI_LOCAL_APIC) 324 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -287,6 +334,11 @@ void stop_apic_nmi_watchdog(void *unused)
287 * 334 *
288 * as these watchdog NMI IRQs are generated on every CPU, we only 335 * as these watchdog NMI IRQs are generated on every CPU, we only
289 * have to check the current processor. 336 * have to check the current processor.
337 *
338 * since NMIs don't listen to _any_ locks, we have to be extremely
339 * careful not to rely on unsafe variables. The printk might lock
340 * up though, so we have to break up any console locks first ...
341 * [when there will be more tty-related locks, break them up here too!]
290 */ 342 */
291 343
292static DEFINE_PER_CPU(unsigned, last_irq_sum); 344static DEFINE_PER_CPU(unsigned, last_irq_sum);
@@ -295,11 +347,11 @@ static DEFINE_PER_CPU(int, nmi_touch);
295 347
296void touch_nmi_watchdog(void) 348void touch_nmi_watchdog(void)
297{ 349{
298 if (nmi_watchdog > 0) { 350 if (nmi_watchdog_active()) {
299 unsigned cpu; 351 unsigned cpu;
300 352
301 /* 353 /*
302 * Tell other CPUs to reset their alert counters. We cannot 354 * Tell other CPUs to reset their alert counters. We cannot
303 * do it ourselves because the alert count increase is not 355 * do it ourselves because the alert count increase is not
304 * atomic. 356 * atomic.
305 */ 357 */
@@ -309,6 +361,9 @@ void touch_nmi_watchdog(void)
309 } 361 }
310 } 362 }
311 363
364 /*
365 * Tickle the softlockup detector too:
366 */
312 touch_softlockup_watchdog(); 367 touch_softlockup_watchdog();
313} 368}
314EXPORT_SYMBOL(touch_nmi_watchdog); 369EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -316,7 +371,12 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
316notrace __kprobes int 371notrace __kprobes int
317nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) 372nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
318{ 373{
319 int sum; 374 /*
375 * Since current_thread_info()-> is always on the stack, and we
376 * always switch the stack NMI-atomically, it's safe to use
377 * smp_processor_id().
378 */
379 unsigned int sum;
320 int touched = 0; 380 int touched = 0;
321 int cpu = smp_processor_id(); 381 int cpu = smp_processor_id();
322 int rc = 0; 382 int rc = 0;
@@ -328,7 +388,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
328 touched = 1; 388 touched = 1;
329 } 389 }
330 390
331 sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); 391 sum = get_timer_irqs(cpu);
392
332 if (__get_cpu_var(nmi_touch)) { 393 if (__get_cpu_var(nmi_touch)) {
333 __get_cpu_var(nmi_touch) = 0; 394 __get_cpu_var(nmi_touch) = 0;
334 touched = 1; 395 touched = 1;
@@ -338,28 +399,29 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
338 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 399 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
339 400
340 spin_lock(&lock); 401 spin_lock(&lock);
341 printk("NMI backtrace for cpu %d\n", cpu); 402 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
342 dump_stack(); 403 dump_stack();
343 spin_unlock(&lock); 404 spin_unlock(&lock);
344 cpu_clear(cpu, backtrace_mask); 405 cpu_clear(cpu, backtrace_mask);
345 } 406 }
346 407
347#ifdef CONFIG_X86_MCE 408 /* Could check oops_in_progress here too, but it's safer not to */
348 /* Could check oops_in_progress here too, but it's safer 409 if (mce_in_progress())
349 not too */
350 if (atomic_read(&mce_entry) > 0)
351 touched = 1; 410 touched = 1;
352#endif 411
353 /* if the apic timer isn't firing, this cpu isn't doing much */ 412 /* if the none of the timers isn't firing, this cpu isn't doing much */
354 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 413 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
355 /* 414 /*
356 * Ayiee, looks like this CPU is stuck ... 415 * Ayiee, looks like this CPU is stuck ...
357 * wait a few IRQs (5 seconds) before doing the oops ... 416 * wait a few IRQs (5 seconds) before doing the oops ...
358 */ 417 */
359 local_inc(&__get_cpu_var(alert_counter)); 418 local_inc(&__get_cpu_var(alert_counter));
360 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) 419 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
361 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, 420 /*
362 panic_on_timeout); 421 * die_nmi will return ONLY if NOTIFY_STOP happens..
422 */
423 die_nmi("BUG: NMI Watchdog detected LOCKUP",
424 regs, panic_on_timeout);
363 } else { 425 } else {
364 __get_cpu_var(last_irq_sum) = sum; 426 __get_cpu_var(last_irq_sum) = sum;
365 local_set(&__get_cpu_var(alert_counter), 0); 427 local_set(&__get_cpu_var(alert_counter), 0);
@@ -373,7 +435,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
373 rc |= lapic_wd_event(nmi_hz); 435 rc |= lapic_wd_event(nmi_hz);
374 break; 436 break;
375 case NMI_IO_APIC: 437 case NMI_IO_APIC:
376 /* don't know how to accurately check for this. 438 /*
439 * don't know how to accurately check for this.
377 * just assume it was a watchdog timer interrupt 440 * just assume it was a watchdog timer interrupt
378 * This matches the old behaviour. 441 * This matches the old behaviour.
379 */ 442 */
@@ -383,31 +446,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
383 return rc; 446 return rc;
384} 447}
385 448
386static unsigned ignore_nmis; 449#ifdef CONFIG_SYSCTL
387
388asmlinkage notrace __kprobes void
389do_nmi(struct pt_regs *regs, long error_code)
390{
391 nmi_enter();
392 add_pda(__nmi_count,1);
393 if (!ignore_nmis)
394 default_do_nmi(regs);
395 nmi_exit();
396}
397
398void stop_nmi(void)
399{
400 acpi_nmi_disable();
401 ignore_nmis++;
402}
403 450
404void restart_nmi(void) 451static int __init setup_unknown_nmi_panic(char *str)
405{ 452{
406 ignore_nmis--; 453 unknown_nmi_panic = 1;
407 acpi_nmi_enable(); 454 return 1;
408} 455}
409 456__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
410#ifdef CONFIG_SYSCTL
411 457
412static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 458static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
413{ 459{
@@ -415,7 +461,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
415 char buf[64]; 461 char buf[64];
416 462
417 sprintf(buf, "NMI received for unknown reason %02x\n", reason); 463 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
418 die_nmi(buf, regs, 1); /* Always panic here */ 464 die_nmi(buf, regs, 1); /* Always panic here */
419 return 0; 465 return 0;
420} 466}
421 467
@@ -433,28 +479,26 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
433 if (!!old_state == !!nmi_watchdog_enabled) 479 if (!!old_state == !!nmi_watchdog_enabled)
434 return 0; 480 return 0;
435 481
436 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { 482 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
437 printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); 483 printk(KERN_WARNING
484 "NMI watchdog is permanently disabled\n");
438 return -EIO; 485 return -EIO;
439 } 486 }
440 487
441 /* if nmi_watchdog is not set yet, then set it */
442 nmi_watchdog_default();
443
444 if (nmi_watchdog == NMI_LOCAL_APIC) { 488 if (nmi_watchdog == NMI_LOCAL_APIC) {
445 if (nmi_watchdog_enabled) 489 if (nmi_watchdog_enabled)
446 enable_lapic_nmi_watchdog(); 490 enable_lapic_nmi_watchdog();
447 else 491 else
448 disable_lapic_nmi_watchdog(); 492 disable_lapic_nmi_watchdog();
449 } else { 493 } else {
450 printk( KERN_WARNING 494 printk(KERN_WARNING
451 "NMI watchdog doesn't know what hardware to touch\n"); 495 "NMI watchdog doesn't know what hardware to touch\n");
452 return -EIO; 496 return -EIO;
453 } 497 }
454 return 0; 498 return 0;
455} 499}
456 500
457#endif 501#endif /* CONFIG_SYSCTL */
458 502
459int do_nmi_callback(struct pt_regs *regs, int cpu) 503int do_nmi_callback(struct pt_regs *regs, int cpu)
460{ 504{
@@ -477,6 +521,3 @@ void __trigger_all_cpu_backtrace(void)
477 mdelay(1); 521 mdelay(1);
478 } 522 }
479} 523}
480
481EXPORT_SYMBOL(nmi_active);
482EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
deleted file mode 100644
index 84160f74eeb0..000000000000
--- a/arch/x86/kernel/nmi_32.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <linux/delay.h>
15#include <linux/interrupt.h>
16#include <linux/module.h>
17#include <linux/nmi.h>
18#include <linux/sysdev.h>
19#include <linux/sysctl.h>
20#include <linux/percpu.h>
21#include <linux/kprobes.h>
22#include <linux/cpumask.h>
23#include <linux/kernel_stat.h>
24#include <linux/kdebug.h>
25#include <linux/slab.h>
26
27#include <asm/smp.h>
28#include <asm/nmi.h>
29
30#include "mach_traps.h"
31
32int unknown_nmi_panic;
33int nmi_watchdog_enabled;
34
35static cpumask_t backtrace_mask = CPU_MASK_NONE;
36
37/* nmi_active:
38 * >0: the lapic NMI watchdog is active, but can be disabled
39 * <0: the lapic NMI watchdog has not been set up, and cannot
40 * be enabled
41 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
44
45unsigned int nmi_watchdog = NMI_DEFAULT;
46static unsigned int nmi_hz = HZ;
47
48static DEFINE_PER_CPU(short, wd_enabled);
49
50static int endflag __initdata = 0;
51
52#ifdef CONFIG_SMP
53/* The performance counters used by NMI_LOCAL_APIC don't trigger when
54 * the CPU is idle. To make sure the NMI watchdog really ticks on all
55 * CPUs during the test make them busy.
56 */
57static __init void nmi_cpu_busy(void *data)
58{
59 local_irq_enable_in_hardirq();
60 /* Intentionally don't use cpu_relax here. This is
61 to make sure that the performance counter really ticks,
62 even if there is a simulator or similar that catches the
63 pause instruction. On a real HT machine this is fine because
64 all other CPUs are busy with "useless" delay loops and don't
65 care if they get somewhat less cycles. */
66 while (endflag == 0)
67 mb();
68}
69#endif
70
71int __init check_nmi_watchdog(void)
72{
73 unsigned int *prev_nmi_count;
74 int cpu;
75
76 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
77 return 0;
78
79 if (!atomic_read(&nmi_active))
80 return 0;
81
82 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
83 if (!prev_nmi_count)
84 return -1;
85
86 printk(KERN_INFO "Testing NMI watchdog ... ");
87
88#ifdef CONFIG_SMP
89 if (nmi_watchdog == NMI_LOCAL_APIC)
90 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
91#endif
92
93 for_each_possible_cpu(cpu)
94 prev_nmi_count[cpu] = nmi_count(cpu);
95 local_irq_enable();
96 mdelay((20*1000)/nmi_hz); // wait 20 ticks
97
98 for_each_possible_cpu(cpu) {
99#ifdef CONFIG_SMP
100 /* Check cpu_callin_map here because that is set
101 after the timer is started. */
102 if (!cpu_isset(cpu, cpu_callin_map))
103 continue;
104#endif
105 if (!per_cpu(wd_enabled, cpu))
106 continue;
107 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
108 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
109 "appears to be stuck (%d->%d)!\n",
110 cpu,
111 prev_nmi_count[cpu],
112 nmi_count(cpu));
113 per_cpu(wd_enabled, cpu) = 0;
114 atomic_dec(&nmi_active);
115 }
116 }
117 endflag = 1;
118 if (!atomic_read(&nmi_active)) {
119 kfree(prev_nmi_count);
120 atomic_set(&nmi_active, -1);
121 return -1;
122 }
123 printk("OK.\n");
124
125 /* now that we know it works we can reduce NMI frequency to
126 something more reasonable; makes a difference in some configs */
127 if (nmi_watchdog == NMI_LOCAL_APIC)
128 nmi_hz = lapic_adjust_nmi_hz(1);
129
130 kfree(prev_nmi_count);
131 return 0;
132}
133
134static int __init setup_nmi_watchdog(char *str)
135{
136 int nmi;
137
138 get_option(&str, &nmi);
139
140 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
141 return 0;
142
143 nmi_watchdog = nmi;
144 return 1;
145}
146
147__setup("nmi_watchdog=", setup_nmi_watchdog);
148
149
150/* Suspend/resume support */
151
152#ifdef CONFIG_PM
153
154static int nmi_pm_active; /* nmi_active before suspend */
155
156static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
157{
158 /* only CPU0 goes here, other CPUs should be offline */
159 nmi_pm_active = atomic_read(&nmi_active);
160 stop_apic_nmi_watchdog(NULL);
161 BUG_ON(atomic_read(&nmi_active) != 0);
162 return 0;
163}
164
165static int lapic_nmi_resume(struct sys_device *dev)
166{
167 /* only CPU0 goes here, other CPUs should be offline */
168 if (nmi_pm_active > 0) {
169 setup_apic_nmi_watchdog(NULL);
170 touch_nmi_watchdog();
171 }
172 return 0;
173}
174
175
176static struct sysdev_class nmi_sysclass = {
177 .name = "lapic_nmi",
178 .resume = lapic_nmi_resume,
179 .suspend = lapic_nmi_suspend,
180};
181
182static struct sys_device device_lapic_nmi = {
183 .id = 0,
184 .cls = &nmi_sysclass,
185};
186
187static int __init init_lapic_nmi_sysfs(void)
188{
189 int error;
190
191 /* should really be a BUG_ON but b/c this is an
192 * init call, it just doesn't work. -dcz
193 */
194 if (nmi_watchdog != NMI_LOCAL_APIC)
195 return 0;
196
197 if (atomic_read(&nmi_active) < 0)
198 return 0;
199
200 error = sysdev_class_register(&nmi_sysclass);
201 if (!error)
202 error = sysdev_register(&device_lapic_nmi);
203 return error;
204}
205/* must come after the local APIC's device_initcall() */
206late_initcall(init_lapic_nmi_sysfs);
207
208#endif /* CONFIG_PM */
209
210static void __acpi_nmi_enable(void *__unused)
211{
212 apic_write_around(APIC_LVT0, APIC_DM_NMI);
213}
214
215/*
216 * Enable timer based NMIs on all CPUs:
217 */
218void acpi_nmi_enable(void)
219{
220 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
221 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
222}
223
224static void __acpi_nmi_disable(void *__unused)
225{
226 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
227}
228
229/*
230 * Disable timer based NMIs on all CPUs:
231 */
232void acpi_nmi_disable(void)
233{
234 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
235 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
236}
237
238void setup_apic_nmi_watchdog(void *unused)
239{
240 if (__get_cpu_var(wd_enabled))
241 return;
242
243 /* cheap hack to support suspend/resume */
244 /* if cpu0 is not active neither should the other cpus */
245 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
246 return;
247
248 switch (nmi_watchdog) {
249 case NMI_LOCAL_APIC:
250 __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
251 if (lapic_watchdog_init(nmi_hz) < 0) {
252 __get_cpu_var(wd_enabled) = 0;
253 return;
254 }
255 /* FALL THROUGH */
256 case NMI_IO_APIC:
257 __get_cpu_var(wd_enabled) = 1;
258 atomic_inc(&nmi_active);
259 }
260}
261
262void stop_apic_nmi_watchdog(void *unused)
263{
264 /* only support LOCAL and IO APICs for now */
265 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
266 (nmi_watchdog != NMI_IO_APIC))
267 return;
268 if (__get_cpu_var(wd_enabled) == 0)
269 return;
270 if (nmi_watchdog == NMI_LOCAL_APIC)
271 lapic_watchdog_stop();
272 __get_cpu_var(wd_enabled) = 0;
273 atomic_dec(&nmi_active);
274}
275
276/*
277 * the best way to detect whether a CPU has a 'hard lockup' problem
278 * is to check it's local APIC timer IRQ counts. If they are not
279 * changing then that CPU has some problem.
280 *
281 * as these watchdog NMI IRQs are generated on every CPU, we only
282 * have to check the current processor.
283 *
284 * since NMIs don't listen to _any_ locks, we have to be extremely
285 * careful not to rely on unsafe variables. The printk might lock
286 * up though, so we have to break up any console locks first ...
287 * [when there will be more tty-related locks, break them up
288 * here too!]
289 */
290
291static unsigned int
292 last_irq_sums [NR_CPUS],
293 alert_counter [NR_CPUS];
294
295void touch_nmi_watchdog(void)
296{
297 if (nmi_watchdog > 0) {
298 unsigned cpu;
299
300 /*
301 * Just reset the alert counters, (other CPUs might be
302 * spinning on locks we hold):
303 */
304 for_each_present_cpu(cpu) {
305 if (alert_counter[cpu])
306 alert_counter[cpu] = 0;
307 }
308 }
309
310 /*
311 * Tickle the softlockup detector too:
312 */
313 touch_softlockup_watchdog();
314}
315EXPORT_SYMBOL(touch_nmi_watchdog);
316
317extern void die_nmi(struct pt_regs *, const char *msg);
318
319notrace __kprobes int
320nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
321{
322
323 /*
324 * Since current_thread_info()-> is always on the stack, and we
325 * always switch the stack NMI-atomically, it's safe to use
326 * smp_processor_id().
327 */
328 unsigned int sum;
329 int touched = 0;
330 int cpu = smp_processor_id();
331 int rc = 0;
332
333 /* check for other users first */
334 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
335 == NOTIFY_STOP) {
336 rc = 1;
337 touched = 1;
338 }
339
340 if (cpu_isset(cpu, backtrace_mask)) {
341 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
342
343 spin_lock(&lock);
344 printk("NMI backtrace for cpu %d\n", cpu);
345 dump_stack();
346 spin_unlock(&lock);
347 cpu_clear(cpu, backtrace_mask);
348 }
349
350 /*
351 * Take the local apic timer and PIT/HPET into account. We don't
352 * know which one is active, when we have highres/dyntick on
353 */
354 sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
355 per_cpu(irq_stat, cpu).irq0_irqs;
356
357 /* if the none of the timers isn't firing, this cpu isn't doing much */
358 if (!touched && last_irq_sums[cpu] == sum) {
359 /*
360 * Ayiee, looks like this CPU is stuck ...
361 * wait a few IRQs (5 seconds) before doing the oops ...
362 */
363 alert_counter[cpu]++;
364 if (alert_counter[cpu] == 5*nmi_hz)
365 /*
366 * die_nmi will return ONLY if NOTIFY_STOP happens..
367 */
368 die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
369 } else {
370 last_irq_sums[cpu] = sum;
371 alert_counter[cpu] = 0;
372 }
373 /* see if the nmi watchdog went off */
374 if (!__get_cpu_var(wd_enabled))
375 return rc;
376 switch (nmi_watchdog) {
377 case NMI_LOCAL_APIC:
378 rc |= lapic_wd_event(nmi_hz);
379 break;
380 case NMI_IO_APIC:
381 /* don't know how to accurately check for this.
382 * just assume it was a watchdog timer interrupt
383 * This matches the old behaviour.
384 */
385 rc = 1;
386 break;
387 }
388 return rc;
389}
390
391#ifdef CONFIG_SYSCTL
392
393static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
394{
395 unsigned char reason = get_nmi_reason();
396 char buf[64];
397
398 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
399 die_nmi(regs, buf);
400 return 0;
401}
402
403/*
404 * proc handler for /proc/sys/kernel/nmi
405 */
406int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
407 void __user *buffer, size_t *length, loff_t *ppos)
408{
409 int old_state;
410
411 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
412 old_state = nmi_watchdog_enabled;
413 proc_dointvec(table, write, file, buffer, length, ppos);
414 if (!!old_state == !!nmi_watchdog_enabled)
415 return 0;
416
417 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
418 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
419 return -EIO;
420 }
421
422 if (nmi_watchdog == NMI_DEFAULT) {
423 if (lapic_watchdog_ok())
424 nmi_watchdog = NMI_LOCAL_APIC;
425 else
426 nmi_watchdog = NMI_IO_APIC;
427 }
428
429 if (nmi_watchdog == NMI_LOCAL_APIC) {
430 if (nmi_watchdog_enabled)
431 enable_lapic_nmi_watchdog();
432 else
433 disable_lapic_nmi_watchdog();
434 } else {
435 printk( KERN_WARNING
436 "NMI watchdog doesn't know what hardware to touch\n");
437 return -EIO;
438 }
439 return 0;
440}
441
442#endif
443
444int do_nmi_callback(struct pt_regs *regs, int cpu)
445{
446#ifdef CONFIG_SYSCTL
447 if (unknown_nmi_panic)
448 return unknown_nmi_panic_callback(regs, cpu);
449#endif
450 return 0;
451}
452
453void __trigger_all_cpu_backtrace(void)
454{
455 int i;
456
457 backtrace_mask = cpu_online_map;
458 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
459 for (i = 0; i < 10 * 1000; i++) {
460 if (cpus_empty(backtrace_mask))
461 break;
462 mdelay(1);
463 }
464}
465
466EXPORT_SYMBOL(nmi_active);
467EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..b8c45610b20a 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,9 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h>
35#include <asm/e820.h>
36#include <asm/setup.h>
34 37
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36 39
@@ -58,6 +61,8 @@ static void __init smp_dump_qct(void)
58 node_end_pfn[node] = MB_TO_PAGES( 61 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); 62 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60 63
64 e820_register_active_regions(node, node_start_pfn[node],
65 node_end_pfn[node]);
61 memory_present(node, 66 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]); 67 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node, 68 node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,23 +72,209 @@ static void __init smp_dump_qct(void)
67 } 72 }
68} 73}
69 74
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
80 75
81static int __init numaq_tsc_disable(void) 76void __init numaq_tsc_disable(void)
82{ 77{
78 if (!found_numaq)
79 return;
80
83 if (num_online_nodes() > 1) { 81 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 setup_clear_cpu_cap(X86_FEATURE_TSC); 83 setup_clear_cpu_cap(X86_FEATURE_TSC);
86 } 84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
87 return 0; 90 return 0;
88} 91}
89arch_initcall(numaq_tsc_disable); 92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
257static __init void early_check_numaq(void)
258{
259 /*
260 * Find possible boot-time SMP configuration:
261 */
262 early_find_smp_config();
263 /*
264 * get boot-time SMP configuration:
265 */
266 if (smp_found_config)
267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
271}
272
273int __init get_memcfg_numaq(void)
274{
275 early_check_numaq();
276 if (!found_numaq)
277 return 0;
278 smp_dump_qct();
279 return 1;
280}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..94da4d52d798 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,7 +29,9 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
34#include <asm/pgalloc.h>
33#include <asm/irq.h> 35#include <asm/irq.h>
34#include <asm/delay.h> 36#include <asm/delay.h>
35#include <asm/fixmap.h> 37#include <asm/fixmap.h>
@@ -122,6 +124,7 @@ static void *get_call_destination(u8 type)
122 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
123 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
124 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
125 }; 128 };
126 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
127} 130}
@@ -139,7 +142,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
139 /* If the operation is a nop, then nop the callsite */ 142 /* If the operation is a nop, then nop the callsite */
140 ret = paravirt_patch_nop(); 143 ret = paravirt_patch_nop();
141 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
142 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) 145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
147 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
143 /* If operation requires a jmp, then jmp */ 148 /* If operation requires a jmp, then jmp */
144 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); 149 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
145 else 150 else
@@ -190,7 +195,9 @@ static void native_flush_tlb_single(unsigned long addr)
190 195
191/* These are in entry.S */ 196/* These are in entry.S */
192extern void native_iret(void); 197extern void native_iret(void);
193extern void native_irq_enable_syscall_ret(void); 198extern void native_irq_enable_sysexit(void);
199extern void native_usergs_sysret32(void);
200extern void native_usergs_sysret64(void);
194 201
195static int __init print_banner(void) 202static int __init print_banner(void)
196{ 203{
@@ -261,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
261 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
262} 269}
263 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
264struct pv_info pv_info = { 282struct pv_info pv_info = {
265 .name = "bare hardware", 283 .name = "bare hardware",
266 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -280,7 +298,7 @@ struct pv_time_ops pv_time_ops = {
280 .get_wallclock = native_get_wallclock, 298 .get_wallclock = native_get_wallclock,
281 .set_wallclock = native_set_wallclock, 299 .set_wallclock = native_set_wallclock,
282 .sched_clock = native_sched_clock, 300 .sched_clock = native_sched_clock,
283 .get_cpu_khz = native_calculate_cpu_khz, 301 .get_tsc_khz = native_calibrate_tsc,
284}; 302};
285 303
286struct pv_irq_ops pv_irq_ops = { 304struct pv_irq_ops pv_irq_ops = {
@@ -291,6 +309,9 @@ struct pv_irq_ops pv_irq_ops = {
291 .irq_enable = native_irq_enable, 309 .irq_enable = native_irq_enable,
292 .safe_halt = native_safe_halt, 310 .safe_halt = native_safe_halt,
293 .halt = native_halt, 311 .halt = native_halt,
312#ifdef CONFIG_X86_64
313 .adjust_exception_frame = paravirt_nop,
314#endif
294}; 315};
295 316
296struct pv_cpu_ops pv_cpu_ops = { 317struct pv_cpu_ops pv_cpu_ops = {
@@ -321,12 +342,23 @@ struct pv_cpu_ops pv_cpu_ops = {
321 .store_idt = native_store_idt, 342 .store_idt = native_store_idt,
322 .store_tr = native_store_tr, 343 .store_tr = native_store_tr,
323 .load_tls = native_load_tls, 344 .load_tls = native_load_tls,
345#ifdef CONFIG_X86_64
346 .load_gs_index = native_load_gs_index,
347#endif
324 .write_ldt_entry = native_write_ldt_entry, 348 .write_ldt_entry = native_write_ldt_entry,
325 .write_gdt_entry = native_write_gdt_entry, 349 .write_gdt_entry = native_write_gdt_entry,
326 .write_idt_entry = native_write_idt_entry, 350 .write_idt_entry = native_write_idt_entry,
327 .load_sp0 = native_load_sp0, 351 .load_sp0 = native_load_sp0,
328 352
329 .irq_enable_syscall_ret = native_irq_enable_syscall_ret, 353#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
354 .irq_enable_sysexit = native_irq_enable_sysexit,
355#endif
356#ifdef CONFIG_X86_64
357#ifdef CONFIG_IA32_EMULATION
358 .usergs_sysret32 = native_usergs_sysret32,
359#endif
360 .usergs_sysret64 = native_usergs_sysret64,
361#endif
330 .iret = native_iret, 362 .iret = native_iret,
331 .swapgs = native_swapgs, 363 .swapgs = native_swapgs,
332 364
@@ -342,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
342struct pv_apic_ops pv_apic_ops = { 374struct pv_apic_ops pv_apic_ops = {
343#ifdef CONFIG_X86_LOCAL_APIC 375#ifdef CONFIG_X86_LOCAL_APIC
344 .apic_write = native_apic_write, 376 .apic_write = native_apic_write,
345 .apic_write_atomic = native_apic_write_atomic,
346 .apic_read = native_apic_read, 377 .apic_read = native_apic_read,
347 .setup_boot_clock = setup_boot_APIC_clock, 378 .setup_boot_clock = setup_boot_APIC_clock,
348 .setup_secondary_clock = setup_secondary_APIC_clock, 379 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -354,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
354#ifndef CONFIG_X86_64 385#ifndef CONFIG_X86_64
355 .pagetable_setup_start = native_pagetable_setup_start, 386 .pagetable_setup_start = native_pagetable_setup_start,
356 .pagetable_setup_done = native_pagetable_setup_done, 387 .pagetable_setup_done = native_pagetable_setup_done,
388#else
389 .pagetable_setup_start = paravirt_nop,
390 .pagetable_setup_done = paravirt_nop,
357#endif 391#endif
358 392
359 .read_cr2 = native_read_cr2, 393 .read_cr2 = native_read_cr2,
@@ -366,6 +400,9 @@ struct pv_mmu_ops pv_mmu_ops = {
366 .flush_tlb_single = native_flush_tlb_single, 400 .flush_tlb_single = native_flush_tlb_single,
367 .flush_tlb_others = native_flush_tlb_others, 401 .flush_tlb_others = native_flush_tlb_others,
368 402
403 .pgd_alloc = __paravirt_pgd_alloc,
404 .pgd_free = paravirt_nop,
405
369 .alloc_pte = paravirt_nop, 406 .alloc_pte = paravirt_nop,
370 .alloc_pmd = paravirt_nop, 407 .alloc_pmd = paravirt_nop,
371 .alloc_pmd_clone = paravirt_nop, 408 .alloc_pmd_clone = paravirt_nop,
@@ -380,6 +417,9 @@ struct pv_mmu_ops pv_mmu_ops = {
380 .pte_update = paravirt_nop, 417 .pte_update = paravirt_nop,
381 .pte_update_defer = paravirt_nop, 418 .pte_update_defer = paravirt_nop,
382 419
420 .ptep_modify_prot_start = __ptep_modify_prot_start,
421 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
422
383#ifdef CONFIG_HIGHPTE 423#ifdef CONFIG_HIGHPTE
384 .kmap_atomic_pte = kmap_atomic, 424 .kmap_atomic_pte = kmap_atomic,
385#endif 425#endif
@@ -403,6 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
403#endif /* PAGETABLE_LEVELS >= 3 */ 443#endif /* PAGETABLE_LEVELS >= 3 */
404 444
405 .pte_val = native_pte_val, 445 .pte_val = native_pte_val,
446 .pte_flags = native_pte_flags,
406 .pgd_val = native_pgd_val, 447 .pgd_val = native_pgd_val,
407 448
408 .make_pte = native_make_pte, 449 .make_pte = native_make_pte,
@@ -416,7 +457,21 @@ struct pv_mmu_ops pv_mmu_ops = {
416 .enter = paravirt_nop, 457 .enter = paravirt_nop,
417 .leave = paravirt_nop, 458 .leave = paravirt_nop,
418 }, 459 },
460
461 .set_fixmap = native_set_fixmap,
462};
463
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
419}; 473};
474EXPORT_SYMBOL_GPL(pv_lock_ops);
420 475
421EXPORT_SYMBOL_GPL(pv_time_ops); 476EXPORT_SYMBOL_GPL(pv_time_ops);
422EXPORT_SYMBOL (pv_cpu_ops); 477EXPORT_SYMBOL (pv_cpu_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 82fc5fcab4f4..58262218781b 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); 5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); 6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
7DEF_NATIVE(pv_cpu_ops, iret, "iret"); 7DEF_NATIVE(pv_cpu_ops, iret, "iret");
8DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); 8DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); 9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); 10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); 11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -29,7 +29,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
30 PATCH_SITE(pv_irq_ops, save_fl); 30 PATCH_SITE(pv_irq_ops, save_fl);
31 PATCH_SITE(pv_cpu_ops, iret); 31 PATCH_SITE(pv_cpu_ops, iret);
32 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); 32 PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
33 PATCH_SITE(pv_mmu_ops, read_cr2); 33 PATCH_SITE(pv_mmu_ops, read_cr2);
34 PATCH_SITE(pv_mmu_ops, read_cr3); 34 PATCH_SITE(pv_mmu_ops, read_cr3);
35 PATCH_SITE(pv_mmu_ops, write_cr3); 35 PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 7d904e138d7e..061d01df9ae6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, clts, "clts"); 14DEF_NATIVE(pv_cpu_ops, clts, "clts");
15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); 15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
16 16
17/* the three commands give us more control to how to return from a syscall */ 17DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
18DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); 18DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
19DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
20 21
21unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 22unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
35 PATCH_SITE(pv_irq_ops, irq_enable); 36 PATCH_SITE(pv_irq_ops, irq_enable);
36 PATCH_SITE(pv_irq_ops, irq_disable); 37 PATCH_SITE(pv_irq_ops, irq_disable);
37 PATCH_SITE(pv_cpu_ops, iret); 38 PATCH_SITE(pv_cpu_ops, iret);
38 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); 39 PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
40 PATCH_SITE(pv_cpu_ops, usergs_sysret32);
41 PATCH_SITE(pv_cpu_ops, usergs_sysret64);
39 PATCH_SITE(pv_cpu_ops, swapgs); 42 PATCH_SITE(pv_cpu_ops, swapgs);
40 PATCH_SITE(pv_mmu_ops, read_cr2); 43 PATCH_SITE(pv_mmu_ops, read_cr2);
41 PATCH_SITE(pv_mmu_ops, read_cr3); 44 PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e28ec497e142..b67a4b1d4eae 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 414 }
411} 415}
412 416
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 417static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 418 int nelems, int direction)
431{ 419{
@@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 424 unsigned long entry;
437 int i; 425 int i;
438 426
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 427 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 428 BUG_ON(!sg_page(s));
444 429
@@ -474,7 +459,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 459static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 460 size_t size, int direction)
476{ 461{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 462 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 463 unsigned long uaddr;
480 unsigned int npages; 464 unsigned int npages;
@@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 467 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 468 npages = num_dma_pages(uaddr, size);
485 469
486 if (translation_enabled(tbl)) 470 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 471}
493 472
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 473static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 476 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 477 unsigned int npages;
499 478
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 479 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 480 iommu_free(tbl, dma_handle, npages);
505} 481}
@@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
522 goto error; 498 goto error;
523 memset(ret, 0, size); 499 memset(ret, 0, size);
524 500
525 if (translation_enabled(tbl)) { 501 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 502 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 503 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 504 goto free;
529 goto free; 505 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 506 return ret;
536
537free: 507free:
538 free_pages((unsigned long)ret, get_order(size)); 508 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 509 ret = NULL;
@@ -541,7 +511,7 @@ error:
541 return ret; 511 return ret;
542} 512}
543 513
544static const struct dma_mapping_ops calgary_dma_ops = { 514static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 515 .alloc_coherent = calgary_alloc_coherent,
546 .map_single = calgary_map_single, 516 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 517 .unmap_single = calgary_unmap_single,
@@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 800
831 tbl = pci_iommu(dev->bus); 801 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 802 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 803
804 if (is_kdump_kernel())
805 calgary_init_bitmap_from_tce_table(tbl);
806 else
807 tce_free(tbl, 0, tbl->it_size);
834 808
835 if (is_calgary(dev->device)) 809 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 810 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1183,10 @@ static int __init calgary_init(void)
1209 if (ret) 1183 if (ret)
1210 return ret; 1184 return ret;
1211 1185
1186 /* Purely for kdump kernel case */
1187 if (is_kdump_kernel())
1188 get_tce_space_from_tar();
1189
1212 do { 1190 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1191 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1192 if (!dev)
@@ -1230,6 +1208,16 @@ static int __init calgary_init(void)
1230 goto error; 1208 goto error;
1231 } while (1); 1209 } while (1);
1232 1210
1211 dev = NULL;
1212 for_each_pci_dev(dev) {
1213 struct iommu_table *tbl;
1214
1215 tbl = find_iommu_table(&dev->dev);
1216
1217 if (translation_enabled(tbl))
1218 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1219 }
1220
1233 return ret; 1221 return ret;
1234 1222
1235error: 1223error:
@@ -1251,6 +1239,7 @@ error:
1251 calgary_disable_translation(dev); 1239 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1240 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1241 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1242 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1243 } while (1);
1255 1244
1256 return ret; 1245 return ret;
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1328 return (val != 0xffffffff);
1340} 1329}
1341 1330
1331/*
1332 * calgary_init_bitmap_from_tce_table():
1333 * Funtion for kdump case. In the second/kdump kernel initialize
1334 * the bitmap based on the tce table entries obtained from first kernel
1335 */
1336static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1337{
1338 u64 *tp;
1339 unsigned int index;
1340 tp = ((u64 *)tbl->it_base);
1341 for (index = 0 ; index < tbl->it_size; index++) {
1342 if (*tp != 0x0)
1343 set_bit(index, tbl->it_map);
1344 tp++;
1345 }
1346}
1347
1348/*
1349 * get_tce_space_from_tar():
1350 * Function for kdump case. Get the tce tables from first kernel
1351 * by reading the contents of the base adress register of calgary iommu
1352 */
1353static void get_tce_space_from_tar()
1354{
1355 int bus;
1356 void __iomem *target;
1357 unsigned long tce_space;
1358
1359 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1360 struct calgary_bus_info *info = &bus_info[bus];
1361 unsigned short pci_device;
1362 u32 val;
1363
1364 val = read_pci_config(bus, 0, 0, 0);
1365 pci_device = (val & 0xFFFF0000) >> 16;
1366
1367 if (!is_cal_pci_dev(pci_device))
1368 continue;
1369 if (info->translation_disabled)
1370 continue;
1371
1372 if (calgary_bus_has_devices(bus, pci_device) ||
1373 translate_empty_slots) {
1374 target = calgary_reg(bus_info[bus].bbar,
1375 tar_offset(bus));
1376 tce_space = be64_to_cpu(readq(target));
1377 tce_space = tce_space & TAR_SW_BITS;
1378
1379 tce_space = tce_space & (~specified_table_size);
1380 info->tce_space = (u64 *)__va(tce_space);
1381 }
1382 }
1383 return;
1384}
1385
1342void __init detect_calgary(void) 1386void __init detect_calgary(void)
1343{ 1387{
1344 int bus; 1388 int bus;
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void)
1394 return; 1438 return;
1395 } 1439 }
1396 1440
1397 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 1441 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1442 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1443
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1444 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1445 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void)
1412 1457
1413 if (calgary_bus_has_devices(bus, pci_device) || 1458 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1459 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1460 /*
1416 if (!tbl) 1461 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1462 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1463 */
1464 if (!is_kdump_kernel()) {
1465 tbl = alloc_tce_table();
1466 if (!tbl)
1467 goto cleanup;
1468 info->tce_space = tbl;
1469 }
1419 calgary_found = 1; 1470 calgary_found = 1;
1420 } 1471 }
1421 } 1472 }
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1481 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1482 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1483 debugging ? "enabled" : "disabled");
1484
1485 /* swiotlb for devices that aren't behind the Calgary. */
1486 if (max_pfn > MAX_DMA32_PFN)
1487 swiotlb = 1;
1433 } 1488 }
1434 return; 1489 return;
1435 1490
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void)
1446{ 1501{
1447 int ret; 1502 int ret;
1448 1503
1449 if (no_iommu || swiotlb) 1504 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1505 return -ENODEV;
1451 1506
1452 if (!calgary_detected) 1507 if (!calgary_detected)
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1514 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1515 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1516 "falling back to no_iommu\n", ret);
1462 if (end_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1517 return ret;
1466 } 1518 }
1467 1519
1468 force_iommu = 1; 1520 force_iommu = 1;
1469 bad_dma_address = 0x0; 1521 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1522 /* dma_ops is set to swiotlb or nommu */
1523 if (!dma_ops)
1524 dma_ops = &nommu_dma_ops;
1471 1525
1472 return 0; 1526 return 0;
1473} 1527}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ace..37544123896d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,13 +5,13 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h>
10 11
11int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
12EXPORT_SYMBOL(forbid_dac);
13 13
14const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
16 16
17static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_size_opt);
74void __init dma32_reserve_bootmem(void) 74void __init dma32_reserve_bootmem(void)
75{ 75{
76 unsigned long size, align; 76 unsigned long size, align;
77 if (end_pfn <= MAX_DMA32_PFN) 77 if (max_pfn <= MAX_DMA32_PFN)
78 return; 78 return;
79 79
80 /*
81 * check aperture_64.c allocate_aperture() for reason about
82 * using 512M as goal
83 */
80 align = 64ULL<<20; 84 align = 64ULL<<20;
81 size = round_up(dma32_bootmem_size, align); 85 size = round_up(dma32_bootmem_size, align);
82 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
83 __pa(MAX_DMA_ADDRESS)); 87 512ULL<<20);
84 if (dma32_bootmem_ptr) 88 if (dma32_bootmem_ptr)
85 dma32_bootmem_size = size; 89 dma32_bootmem_size = size;
86 else 90 else
@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
88} 92}
89static void __init dma32_free_bootmem(void) 93static void __init dma32_free_bootmem(void)
90{ 94{
91 int node;
92 95
93 if (end_pfn <= MAX_DMA32_PFN) 96 if (max_pfn <= MAX_DMA32_PFN)
94 return; 97 return;
95 98
96 if (!dma32_bootmem_ptr) 99 if (!dma32_bootmem_ptr)
97 return; 100 return;
98 101
99 for_each_online_node(node) 102 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
100 free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
101 dma32_bootmem_size);
102 103
103 dma32_bootmem_ptr = NULL; 104 dma32_bootmem_ptr = NULL;
104 dma32_bootmem_size = 0; 105 dma32_bootmem_size = 0;
@@ -112,19 +113,15 @@ void __init pci_iommu_alloc(void)
112 * The order of these functions is important for 113 * The order of these functions is important for
113 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
114 */ 115 */
115#ifdef CONFIG_GART_IOMMU
116 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
117#endif
118 117
119#ifdef CONFIG_CALGARY_IOMMU
120 detect_calgary(); 118 detect_calgary();
121#endif
122 119
123 detect_intel_iommu(); 120 detect_intel_iommu();
124 121
125#ifdef CONFIG_SWIOTLB 122 amd_iommu_detect();
123
126 pci_swiotlb_init(); 124 pci_swiotlb_init();
127#endif
128} 125}
129#endif 126#endif
130 127
@@ -180,9 +177,7 @@ static __init int iommu_setup(char *p)
180 swiotlb = 1; 177 swiotlb = 1;
181#endif 178#endif
182 179
183#ifdef CONFIG_GART_IOMMU
184 gart_parse_options(p); 180 gart_parse_options(p);
185#endif
186 181
187#ifdef CONFIG_CALGARY_IOMMU 182#ifdef CONFIG_CALGARY_IOMMU
188 if (!strncmp(p, "calgary", 7)) 183 if (!strncmp(p, "calgary", 7))
@@ -317,16 +312,17 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
317 312
318int dma_supported(struct device *dev, u64 mask) 313int dma_supported(struct device *dev, u64 mask)
319{ 314{
315 struct dma_mapping_ops *ops = get_dma_ops(dev);
316
320#ifdef CONFIG_PCI 317#ifdef CONFIG_PCI
321 if (mask > 0xffffffff && forbid_dac > 0) { 318 if (mask > 0xffffffff && forbid_dac > 0) {
322 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 319 dev_info(dev, "PCI: Disallowing DAC for device\n");
323 dev->bus_id);
324 return 0; 320 return 0;
325 } 321 }
326#endif 322#endif
327 323
328 if (dma_ops->dma_supported) 324 if (ops->dma_supported)
329 return dma_ops->dma_supported(dev, mask); 325 return ops->dma_supported(dev, mask);
330 326
331 /* Copied from i386. Doesn't make much sense, because it will 327 /* Copied from i386. Doesn't make much sense, because it will
332 only work for pci_alloc_coherent. 328 only work for pci_alloc_coherent.
@@ -347,8 +343,7 @@ int dma_supported(struct device *dev, u64 mask)
347 type. Normally this doesn't make any difference, but gives 343 type. Normally this doesn't make any difference, but gives
348 more gentle handling of IOMMU overflow. */ 344 more gentle handling of IOMMU overflow. */
349 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 345 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
350 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 346 dev_info(dev, "Force SAC with mask %Lx\n", mask);
351 dev->bus_id, mask);
352 return 0; 347 return 0;
353 } 348 }
354 349
@@ -357,7 +352,7 @@ int dma_supported(struct device *dev, u64 mask)
357EXPORT_SYMBOL(dma_supported); 352EXPORT_SYMBOL(dma_supported);
358 353
359/* Allocate DMA memory on node near device */ 354/* Allocate DMA memory on node near device */
360noinline struct page * 355static noinline struct page *
361dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) 356dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
362{ 357{
363 int node; 358 int node;
@@ -374,6 +369,7 @@ void *
374dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 369dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
375 gfp_t gfp) 370 gfp_t gfp)
376{ 371{
372 struct dma_mapping_ops *ops = get_dma_ops(dev);
377 void *memory = NULL; 373 void *memory = NULL;
378 struct page *page; 374 struct page *page;
379 unsigned long dma_mask = 0; 375 unsigned long dma_mask = 0;
@@ -442,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
442 /* Let low level make its own zone decisions */ 438 /* Let low level make its own zone decisions */
443 gfp &= ~(GFP_DMA32|GFP_DMA); 439 gfp &= ~(GFP_DMA32|GFP_DMA);
444 440
445 if (dma_ops->alloc_coherent) 441 if (ops->alloc_coherent)
446 return dma_ops->alloc_coherent(dev, size, 442 return ops->alloc_coherent(dev, size,
447 dma_handle, gfp); 443 dma_handle, gfp);
448 return NULL; 444 return NULL;
449 } 445 }
@@ -455,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
455 } 451 }
456 } 452 }
457 453
458 if (dma_ops->alloc_coherent) { 454 if (ops->alloc_coherent) {
459 free_pages((unsigned long)memory, get_order(size)); 455 free_pages((unsigned long)memory, get_order(size));
460 gfp &= ~(GFP_DMA|GFP_DMA32); 456 gfp &= ~(GFP_DMA|GFP_DMA32);
461 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); 457 return ops->alloc_coherent(dev, size, dma_handle, gfp);
462 } 458 }
463 459
464 if (dma_ops->map_simple) { 460 if (ops->map_simple) {
465 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), 461 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
466 size, 462 size,
467 PCI_DMA_BIDIRECTIONAL); 463 PCI_DMA_BIDIRECTIONAL);
468 if (*dma_handle != bad_dma_address) 464 if (*dma_handle != bad_dma_address)
@@ -484,27 +480,27 @@ EXPORT_SYMBOL(dma_alloc_coherent);
484void dma_free_coherent(struct device *dev, size_t size, 480void dma_free_coherent(struct device *dev, size_t size,
485 void *vaddr, dma_addr_t bus) 481 void *vaddr, dma_addr_t bus)
486{ 482{
483 struct dma_mapping_ops *ops = get_dma_ops(dev);
484
487 int order = get_order(size); 485 int order = get_order(size);
488 WARN_ON(irqs_disabled()); /* for portability */ 486 WARN_ON(irqs_disabled()); /* for portability */
489 if (dma_release_coherent(dev, order, vaddr)) 487 if (dma_release_coherent(dev, order, vaddr))
490 return; 488 return;
491 if (dma_ops->unmap_single) 489 if (ops->unmap_single)
492 dma_ops->unmap_single(dev, bus, size, 0); 490 ops->unmap_single(dev, bus, size, 0);
493 free_pages((unsigned long)vaddr, order); 491 free_pages((unsigned long)vaddr, order);
494} 492}
495EXPORT_SYMBOL(dma_free_coherent); 493EXPORT_SYMBOL(dma_free_coherent);
496 494
497static int __init pci_iommu_init(void) 495static int __init pci_iommu_init(void)
498{ 496{
499#ifdef CONFIG_CALGARY_IOMMU
500 calgary_iommu_init(); 497 calgary_iommu_init();
501#endif
502 498
503 intel_iommu_init(); 499 intel_iommu_init();
504 500
505#ifdef CONFIG_GART_IOMMU 501 amd_iommu_init();
502
506 gart_iommu_init(); 503 gart_iommu_init();
507#endif
508 504
509 no_iommu_init(); 505 no_iommu_init();
510 return 0; 506 return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index aa8ec928caa8..744126e64950 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -104,7 +105,6 @@ static unsigned long alloc_iommu(struct device *dev, int size)
104 size, base_index, boundary_size, 0); 105 size, base_index, boundary_size, 0);
105 } 106 }
106 if (offset != -1) { 107 if (offset != -1) {
107 set_bit_string(iommu_gart_bitmap, offset, size);
108 next_bit = offset+size; 108 next_bit = offset+size;
109 if (next_bit >= iommu_pages) { 109 if (next_bit >= iommu_pages) {
110 next_bit = 0; 110 next_bit = 0;
@@ -198,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
198 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
199 */ 199 */
200 200
201 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
202 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
203 size, dev->bus_id);
204 202
205 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
206 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -534,8 +532,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
534 unsigned aper_size = 0, aper_base_32, aper_order; 532 unsigned aper_size = 0, aper_base_32, aper_order;
535 u64 aper_base; 533 u64 aper_base;
536 534
537 pci_read_config_dword(dev, 0x94, &aper_base_32); 535 pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
538 pci_read_config_dword(dev, 0x90, &aper_order); 536 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
539 aper_order = (aper_order >> 1) & 7; 537 aper_order = (aper_order >> 1) & 7;
540 538
541 aper_base = aper_base_32 & 0x7fff; 539 aper_base = aper_base_32 & 0x7fff;
@@ -549,14 +547,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
549 return aper_base; 547 return aper_base;
550} 548}
551 549
550static void enable_gart_translations(void)
551{
552 int i;
553
554 for (i = 0; i < num_k8_northbridges; i++) {
555 struct pci_dev *dev = k8_northbridges[i];
556
557 enable_gart_translation(dev, __pa(agp_gatt_table));
558 }
559}
560
561/*
562 * If fix_up_north_bridges is set, the north bridges have to be fixed up on
563 * resume in the same way as they are handled in gart_iommu_hole_init().
564 */
565static bool fix_up_north_bridges;
566static u32 aperture_order;
567static u32 aperture_alloc;
568
569void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570{
571 fix_up_north_bridges = true;
572 aperture_order = aper_order;
573 aperture_alloc = aper_alloc;
574}
575
552static int gart_resume(struct sys_device *dev) 576static int gart_resume(struct sys_device *dev)
553{ 577{
578 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
579
580 if (fix_up_north_bridges) {
581 int i;
582
583 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
584
585 for (i = 0; i < num_k8_northbridges; i++) {
586 struct pci_dev *dev = k8_northbridges[i];
587
588 /*
589 * Don't enable translations just yet. That is the next
590 * step. Restore the pre-suspend aperture settings.
591 */
592 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
593 aperture_order << 1);
594 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
595 aperture_alloc >> 25);
596 }
597 }
598
599 enable_gart_translations();
600
554 return 0; 601 return 0;
555} 602}
556 603
557static int gart_suspend(struct sys_device *dev, pm_message_t state) 604static int gart_suspend(struct sys_device *dev, pm_message_t state)
558{ 605{
559 return -EINVAL; 606 return 0;
560} 607}
561 608
562static struct sysdev_class gart_sysdev_class = { 609static struct sysdev_class gart_sysdev_class = {
@@ -582,6 +629,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
582 struct pci_dev *dev; 629 struct pci_dev *dev;
583 void *gatt; 630 void *gatt;
584 int i, error; 631 int i, error;
632 unsigned long start_pfn, end_pfn;
585 633
586 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 634 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
587 aper_size = aper_base = info->aper_size = 0; 635 aper_size = aper_base = info->aper_size = 0;
@@ -614,31 +662,25 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
614 memset(gatt, 0, gatt_size); 662 memset(gatt, 0, gatt_size);
615 agp_gatt_table = gatt; 663 agp_gatt_table = gatt;
616 664
617 for (i = 0; i < num_k8_northbridges; i++) { 665 enable_gart_translations();
618 u32 gatt_reg;
619 u32 ctl;
620
621 dev = k8_northbridges[i];
622 gatt_reg = __pa(gatt) >> 12;
623 gatt_reg <<= 4;
624 pci_write_config_dword(dev, 0x98, gatt_reg);
625 pci_read_config_dword(dev, 0x90, &ctl);
626
627 ctl |= 1;
628 ctl &= ~((1<<4) | (1<<5));
629
630 pci_write_config_dword(dev, 0x90, ctl);
631 }
632 666
633 error = sysdev_class_register(&gart_sysdev_class); 667 error = sysdev_class_register(&gart_sysdev_class);
634 if (!error) 668 if (!error)
635 error = sysdev_register(&device_gart); 669 error = sysdev_register(&device_gart);
636 if (error) 670 if (error)
637 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 671 panic("Could not register gart_sysdev -- would corrupt data on next suspend");
672
638 flush_gart(); 673 flush_gart();
639 674
640 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 675 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
641 aper_base, aper_size>>10); 676 aper_base, aper_size>>10);
677
678 /* need to map that range */
679 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
680 if (end_pfn > max_low_pfn_mapped) {
681 start_pfn = (aper_base>>PAGE_SHIFT);
682 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
683 }
642 return 0; 684 return 0;
643 685
644 nommu: 686 nommu:
@@ -650,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 692
651extern int agp_amd64_init(void); 693extern int agp_amd64_init(void);
652 694
653static const struct dma_mapping_ops gart_dma_ops = { 695static struct dma_mapping_ops gart_dma_ops = {
654 .mapping_error = NULL,
655 .map_single = gart_map_single, 696 .map_single = gart_map_single,
656 .map_simple = gart_map_simple, 697 .map_simple = gart_map_simple,
657 .unmap_single = gart_unmap_single, 698 .unmap_single = gart_unmap_single,
@@ -677,11 +718,11 @@ void gart_iommu_shutdown(void)
677 u32 ctl; 718 u32 ctl;
678 719
679 dev = k8_northbridges[i]; 720 dev = k8_northbridges[i];
680 pci_read_config_dword(dev, 0x90, &ctl); 721 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
681 722
682 ctl &= ~1; 723 ctl &= ~GARTEN;
683 724
684 pci_write_config_dword(dev, 0x90, ctl); 725 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
685 } 726 }
686} 727}
687 728
@@ -716,10 +757,10 @@ void __init gart_iommu_init(void)
716 return; 757 return;
717 758
718 if (no_iommu || 759 if (no_iommu ||
719 (!force_iommu && end_pfn <= MAX_DMA32_PFN) || 760 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
720 !gart_iommu_aperture || 761 !gart_iommu_aperture ||
721 (no_agp && init_k8_gatt(&info) < 0)) { 762 (no_agp && init_k8_gatt(&info) < 0)) {
722 if (end_pfn > MAX_DMA32_PFN) { 763 if (max_pfn > MAX_DMA32_PFN) {
723 printk(KERN_WARNING "More than 4GB of memory " 764 printk(KERN_WARNING "More than 4GB of memory "
724 "but GART IOMMU not available.\n" 765 "but GART IOMMU not available.\n"
725 KERN_WARNING "falling back to iommu=soft.\n"); 766 KERN_WARNING "falling back to iommu=soft.\n");
@@ -788,10 +829,10 @@ void __init gart_iommu_init(void)
788 wbinvd(); 829 wbinvd();
789 830
790 /* 831 /*
791 * Try to workaround a bug (thanks to BenH) 832 * Try to workaround a bug (thanks to BenH):
792 * Set unmapped entries to a scratch page instead of 0. 833 * Set unmapped entries to a scratch page instead of 0.
793 * Any prefetches that hit unmapped entries won't get an bus abort 834 * Any prefetches that hit unmapped entries won't get an bus abort
794 * then. 835 * then. (P2P bridge may be prefetching on DMA reads).
795 */ 836 */
796 scratch = get_zeroed_page(GFP_KERNEL); 837 scratch = get_zeroed_page(GFP_KERNEL);
797 if (!scratch) 838 if (!scratch)
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75struct dma_mapping_ops nommu_dma_ops = {
76static int nommu_mapping_error(dma_addr_t dma_addr)
77{
78#ifdef CONFIG_X86_32
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85
86const struct dma_mapping_ops nommu_dma_ops = {
87 .map_single = nommu_map_single, 76 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 77 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 78 .is_phys = 1,
91}; 79};
92 80
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 490da7f4b8d0..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
@@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = {
38void __init pci_swiotlb_init(void) 38void __init pci_swiotlb_init(void)
39{ 39{
40 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 40 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
41 if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) 41 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
42 swiotlb = 1; 42 swiotlb = 1;
43 if (swiotlb_force) 43 if (swiotlb_force)
44 swiotlb = 1; 44 swiotlb = 1;
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
new file mode 100644
index 000000000000..675a48c404a5
--- /dev/null
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -0,0 +1,166 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/uaccess.h>
4#include <linux/mmzone.h>
5#include <linux/ioport.h>
6#include <linux/seq_file.h>
7#include <linux/console.h>
8#include <linux/init.h>
9#include <linux/edd.h>
10#include <linux/dmi.h>
11#include <linux/pfn.h>
12#include <linux/pci.h>
13#include <asm/pci-direct.h>
14
15
16#include <asm/e820.h>
17#include <asm/mmzone.h>
18#include <asm/setup.h>
19#include <asm/sections.h>
20#include <asm/io.h>
21#include <setup_arch.h>
22
23static struct resource system_rom_resource = {
24 .name = "System ROM",
25 .start = 0xf0000,
26 .end = 0xfffff,
27 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
28};
29
30static struct resource extension_rom_resource = {
31 .name = "Extension ROM",
32 .start = 0xe0000,
33 .end = 0xeffff,
34 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
35};
36
37static struct resource adapter_rom_resources[] = { {
38 .name = "Adapter ROM",
39 .start = 0xc8000,
40 .end = 0,
41 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
42}, {
43 .name = "Adapter ROM",
44 .start = 0,
45 .end = 0,
46 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
47}, {
48 .name = "Adapter ROM",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
52}, {
53 .name = "Adapter ROM",
54 .start = 0,
55 .end = 0,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57}, {
58 .name = "Adapter ROM",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
62}, {
63 .name = "Adapter ROM",
64 .start = 0,
65 .end = 0,
66 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
67} };
68
69static struct resource video_rom_resource = {
70 .name = "Video ROM",
71 .start = 0xc0000,
72 .end = 0xc7fff,
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74};
75
76#define ROMSIGNATURE 0xaa55
77
78static int __init romsignature(const unsigned char *rom)
79{
80 const unsigned short * const ptr = (const unsigned short *)rom;
81 unsigned short sig;
82
83 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
84}
85
86static int __init romchecksum(const unsigned char *rom, unsigned long length)
87{
88 unsigned char sum, c;
89
90 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
91 sum += c;
92 return !length && !sum;
93}
94
95void __init probe_roms(void)
96{
97 const unsigned char *rom;
98 unsigned long start, length, upper;
99 unsigned char c;
100 int i;
101
102 /* video rom */
103 upper = adapter_rom_resources[0].start;
104 for (start = video_rom_resource.start; start < upper; start += 2048) {
105 rom = isa_bus_to_virt(start);
106 if (!romsignature(rom))
107 continue;
108
109 video_rom_resource.start = start;
110
111 if (probe_kernel_address(rom + 2, c) != 0)
112 continue;
113
114 /* 0 < length <= 0x7f * 512, historically */
115 length = c * 512;
116
117 /* if checksum okay, trust length byte */
118 if (length && romchecksum(rom, length))
119 video_rom_resource.end = start + length - 1;
120
121 request_resource(&iomem_resource, &video_rom_resource);
122 break;
123 }
124
125 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
126 if (start < upper)
127 start = upper;
128
129 /* system rom */
130 request_resource(&iomem_resource, &system_rom_resource);
131 upper = system_rom_resource.start;
132
133 /* check for extension rom (ignore length byte!) */
134 rom = isa_bus_to_virt(extension_rom_resource.start);
135 if (romsignature(rom)) {
136 length = extension_rom_resource.end - extension_rom_resource.start + 1;
137 if (romchecksum(rom, length)) {
138 request_resource(&iomem_resource, &extension_rom_resource);
139 upper = extension_rom_resource.start;
140 }
141 }
142
143 /* check for adapter roms on 2k boundaries */
144 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
145 rom = isa_bus_to_virt(start);
146 if (!romsignature(rom))
147 continue;
148
149 if (probe_kernel_address(rom + 2, c) != 0)
150 continue;
151
152 /* 0 < length <= 0x7f * 512, historically */
153 length = c * 512;
154
155 /* but accept any length that fits if checksum okay */
156 if (!length || start + length > upper || !romchecksum(rom, length))
157 continue;
158
159 adapter_rom_resources[i].start = start;
160 adapter_rom_resources[i].end = start + length - 1;
161 request_resource(&iomem_resource, &adapter_rom_resources[i]);
162
163 start = adapter_rom_resources[i++].end & ~2047UL;
164 }
165}
166
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685b..7fc4d5b0a6a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,8 +6,16 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h>
10#include <asm/system.h>
11
12unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt);
14unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait);
9 16
10struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
11 19
12int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
13{ 21{
@@ -45,6 +53,76 @@ void arch_task_cache_init(void)
45 SLAB_PANIC, NULL); 53 SLAB_PANIC, NULL);
46} 54}
47 55
56/*
57 * Idle related variables and functions
58 */
59unsigned long boot_option_idle_override = 0;
60EXPORT_SYMBOL(boot_option_idle_override);
61
62/*
63 * Powermanagement idle function, if any..
64 */
65void (*pm_idle)(void);
66EXPORT_SYMBOL(pm_idle);
67
68#ifdef CONFIG_X86_32
69/*
70 * This halt magic was a workaround for ancient floppy DMA
71 * wreckage. It should be safe to remove.
72 */
73static int hlt_counter;
74void disable_hlt(void)
75{
76 hlt_counter++;
77}
78EXPORT_SYMBOL(disable_hlt);
79
80void enable_hlt(void)
81{
82 hlt_counter--;
83}
84EXPORT_SYMBOL(enable_hlt);
85
86static inline int hlt_use_halt(void)
87{
88 return (!hlt_counter && boot_cpu_data.hlt_works_ok);
89}
90#else
91static inline int hlt_use_halt(void)
92{
93 return 1;
94}
95#endif
96
97/*
98 * We use this if we don't have any better
99 * idle routine..
100 */
101void default_idle(void)
102{
103 if (hlt_use_halt()) {
104 current_thread_info()->status &= ~TS_POLLING;
105 /*
106 * TS_POLLING-cleared state must be visible before we
107 * test NEED_RESCHED:
108 */
109 smp_mb();
110
111 if (!need_resched())
112 safe_halt(); /* enables interrupts racelessly */
113 else
114 local_irq_enable();
115 current_thread_info()->status |= TS_POLLING;
116 } else {
117 local_irq_enable();
118 /* loop is done by the caller */
119 cpu_relax();
120 }
121}
122#ifdef CONFIG_APM_MODULE
123EXPORT_SYMBOL(default_idle);
124#endif
125
48static void do_nothing(void *unused) 126static void do_nothing(void *unused)
49{ 127{
50} 128}
@@ -61,7 +139,7 @@ void cpu_idle_wait(void)
61{ 139{
62 smp_mb(); 140 smp_mb();
63 /* kick all the CPUs so that they exit out of pm_idle */ 141 /* kick all the CPUs so that they exit out of pm_idle */
64 smp_call_function(do_nothing, NULL, 0, 1); 142 smp_call_function(do_nothing, NULL, 1);
65} 143}
66EXPORT_SYMBOL_GPL(cpu_idle_wait); 144EXPORT_SYMBOL_GPL(cpu_idle_wait);
67 145
@@ -122,54 +200,163 @@ static void poll_idle(void)
122 * 200 *
123 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
124 */ 202 */
203static int __cpuinitdata force_mwait;
204
205#define MWAIT_INFO 0x05
206#define MWAIT_ECX_EXTENDED_INFO 0x01
207#define MWAIT_EDX_C1 0xf0
208
125static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 209static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
126{ 210{
211 u32 eax, ebx, ecx, edx;
212
127 if (force_mwait) 213 if (force_mwait)
128 return 1; 214 return 1;
129 215
130 if (c->x86_vendor == X86_VENDOR_AMD) { 216 if (c->cpuid_level < MWAIT_INFO)
131 switch(c->x86) { 217 return 0;
132 case 0x10: 218
133 case 0x11: 219 cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
134 return 0; 220 /* Check, whether EDX has extended info about MWAIT */
135 } 221 if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
136 } 222 return 1;
223
224 /*
225 * edx enumeratios MONITOR/MWAIT extensions. Check, whether
226 * C1 supports MWAIT
227 */
228 return (edx & MWAIT_EDX_C1);
229}
230
231/*
232 * Check for AMD CPUs, which have potentially C1E support
233 */
234static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
235{
236 if (c->x86_vendor != X86_VENDOR_AMD)
237 return 0;
238
239 if (c->x86 < 0x0F)
240 return 0;
241
242 /* Family 0x0f models < rev F do not have C1E */
243 if (c->x86 == 0x0f && c->x86_model < 0x40)
244 return 0;
245
137 return 1; 246 return 1;
138} 247}
139 248
140void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 249/*
250 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same
252 * way as C3 power states (local apic timer and TSC stop)
253 */
254static void c1e_idle(void)
141{ 255{
142 static int selected; 256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
143 258
144 if (selected) 259 if (need_resched())
145 return; 260 return;
261
262 if (!c1e_detected) {
263 u32 lo, hi;
264
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E");
269 printk(KERN_INFO "System has C1E enabled\n");
270 }
271 }
272
273 if (c1e_detected) {
274 int cpu = smp_processor_id();
275
276 if (!cpu_isset(cpu, c1e_mask)) {
277 cpu_set(cpu, c1e_mask);
278 /*
279 * Force broadcast so ACPI can not interfere. Needs
280 * to run with interrupts enabled as it uses
281 * smp_function_call.
282 */
283 local_irq_enable();
284 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
285 &cpu);
286 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
287 cpu);
288 local_irq_disable();
289 }
290 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
291
292 default_idle();
293
294 /*
295 * The switch back from broadcast mode needs to be
296 * called with interrupts disabled.
297 */
298 local_irq_disable();
299 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
300 local_irq_enable();
301 } else
302 default_idle();
303}
304
305void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
306{
146#ifdef CONFIG_X86_SMP 307#ifdef CONFIG_X86_SMP
147 if (pm_idle == poll_idle && smp_num_siblings > 1) { 308 if (pm_idle == poll_idle && smp_num_siblings > 1) {
148 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 309 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
149 " performance may degrade.\n"); 310 " performance may degrade.\n");
150 } 311 }
151#endif 312#endif
313 if (pm_idle)
314 return;
315
152 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { 316 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
153 /* 317 /*
154 * Skip, if setup has overridden idle.
155 * One CPU supports mwait => All CPUs supports mwait 318 * One CPU supports mwait => All CPUs supports mwait
156 */ 319 */
157 if (!pm_idle) { 320 printk(KERN_INFO "using mwait in idle threads.\n");
158 printk(KERN_INFO "using mwait in idle threads.\n"); 321 pm_idle = mwait_idle;
159 pm_idle = mwait_idle; 322 } else if (check_c1e_idle(c)) {
160 } 323 printk(KERN_INFO "using C1E aware idle routine\n");
161 } 324 pm_idle = c1e_idle;
162 selected = 1; 325 } else
326 pm_idle = default_idle;
163} 327}
164 328
165static int __init idle_setup(char *str) 329static int __init idle_setup(char *str)
166{ 330{
331 if (!str)
332 return -EINVAL;
333
167 if (!strcmp(str, "poll")) { 334 if (!strcmp(str, "poll")) {
168 printk("using polling idle threads.\n"); 335 printk("using polling idle threads.\n");
169 pm_idle = poll_idle; 336 pm_idle = poll_idle;
170 } else if (!strcmp(str, "mwait")) 337 } else if (!strcmp(str, "mwait"))
171 force_mwait = 1; 338 force_mwait = 1;
172 else 339 else if (!strcmp(str, "halt")) {
340 /*
341 * When the boot option of idle=halt is added, halt is
342 * forced to be used for CPU idle. In such case CPU C2/C3
343 * won't be used again.
344 * To continue to load the CPU idle driver, don't touch
345 * the boot_option_idle_override.
346 */
347 pm_idle = default_idle;
348 idle_halt = 1;
349 return 0;
350 } else if (!strcmp(str, "nomwait")) {
351 /*
352 * If the boot option of "idle=nomwait" is added,
353 * it means that mwait will be disabled for CPU C2/C3
354 * states. In such case it won't touch the variable
355 * of boot_option_idle_override.
356 */
357 idle_nomwait = 1;
358 return 0;
359 } else
173 return -1; 360 return -1;
174 361
175 boot_option_idle_override = 1; 362 boot_option_idle_override = 1;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e2db9ac5c61c..53bc653ed5ca 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,11 +58,6 @@
58 58
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 60
61static int hlt_counter;
62
63unsigned long boot_option_idle_override = 0;
64EXPORT_SYMBOL(boot_option_idle_override);
65
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 61DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 62EXPORT_PER_CPU_SYMBOL(current_task);
68 63
@@ -77,57 +72,24 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
77 return ((unsigned long *)tsk->thread.sp)[3]; 72 return ((unsigned long *)tsk->thread.sp)[3];
78} 73}
79 74
80/* 75#ifdef CONFIG_HOTPLUG_CPU
81 * Powermanagement idle function, if any.. 76#include <asm/nmi.h>
82 */
83void (*pm_idle)(void);
84EXPORT_SYMBOL(pm_idle);
85 77
86void disable_hlt(void) 78static void cpu_exit_clear(void)
87{ 79{
88 hlt_counter++; 80 int cpu = raw_smp_processor_id();
89}
90 81
91EXPORT_SYMBOL(disable_hlt); 82 idle_task_exit();
92 83
93void enable_hlt(void) 84 cpu_uninit();
94{ 85 irq_ctx_exit(cpu);
95 hlt_counter--;
96}
97 86
98EXPORT_SYMBOL(enable_hlt); 87 cpu_clear(cpu, cpu_callout_map);
99 88 cpu_clear(cpu, cpu_callin_map);
100/*
101 * We use this if we don't have any better
102 * idle routine..
103 */
104void default_idle(void)
105{
106 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
107 current_thread_info()->status &= ~TS_POLLING;
108 /*
109 * TS_POLLING-cleared state must be visible before we
110 * test NEED_RESCHED:
111 */
112 smp_mb();
113 89
114 if (!need_resched()) 90 numa_remove_cpu(cpu);
115 safe_halt(); /* enables interrupts racelessly */
116 else
117 local_irq_enable();
118 current_thread_info()->status |= TS_POLLING;
119 } else {
120 local_irq_enable();
121 /* loop is done by the caller */
122 cpu_relax();
123 }
124} 91}
125#ifdef CONFIG_APM_MODULE
126EXPORT_SYMBOL(default_idle);
127#endif
128 92
129#ifdef CONFIG_HOTPLUG_CPU
130#include <asm/nmi.h>
131/* We don't actually take CPU down, just spin without interrupts. */ 93/* We don't actually take CPU down, just spin without interrupts. */
132static inline void play_dead(void) 94static inline void play_dead(void)
133{ 95{
@@ -166,26 +128,24 @@ void cpu_idle(void)
166 128
167 /* endless idle loop with no priority at all */ 129 /* endless idle loop with no priority at all */
168 while (1) { 130 while (1) {
169 tick_nohz_stop_sched_tick(); 131 tick_nohz_stop_sched_tick(1);
170 while (!need_resched()) { 132 while (!need_resched()) {
171 void (*idle)(void);
172 133
173 check_pgt_cache(); 134 check_pgt_cache();
174 rmb(); 135 rmb();
175 idle = pm_idle;
176 136
177 if (rcu_pending(cpu)) 137 if (rcu_pending(cpu))
178 rcu_check_callbacks(cpu, 0); 138 rcu_check_callbacks(cpu, 0);
179 139
180 if (!idle)
181 idle = default_idle;
182
183 if (cpu_is_offline(cpu)) 140 if (cpu_is_offline(cpu))
184 play_dead(); 141 play_dead();
185 142
186 local_irq_disable(); 143 local_irq_disable();
187 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 144 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
188 idle(); 145 /* Don't trace irqs off for idle */
146 stop_critical_timings();
147 pm_idle();
148 start_critical_timings();
189 } 149 }
190 tick_nohz_restart_sched_tick(); 150 tick_nohz_restart_sched_tick();
191 preempt_enable_no_resched(); 151 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c6eb5c91e5f6..3fb62a7d9a16 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void);
56 56
57unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 57unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58 58
59unsigned long boot_option_idle_override = 0;
60EXPORT_SYMBOL(boot_option_idle_override);
61
62/*
63 * Powermanagement idle function, if any..
64 */
65void (*pm_idle)(void);
66EXPORT_SYMBOL(pm_idle);
67
68static ATOMIC_NOTIFIER_HEAD(idle_notifier); 59static ATOMIC_NOTIFIER_HEAD(idle_notifier);
69 60
70void idle_notifier_register(struct notifier_block *n) 61void idle_notifier_register(struct notifier_block *n)
@@ -94,25 +85,6 @@ void exit_idle(void)
94 __exit_idle(); 85 __exit_idle();
95} 86}
96 87
97/*
98 * We use this if we don't have any better
99 * idle routine..
100 */
101void default_idle(void)
102{
103 current_thread_info()->status &= ~TS_POLLING;
104 /*
105 * TS_POLLING-cleared state must be visible before we
106 * test NEED_RESCHED:
107 */
108 smp_mb();
109 if (!need_resched())
110 safe_halt(); /* enables interrupts racelessly */
111 else
112 local_irq_enable();
113 current_thread_info()->status |= TS_POLLING;
114}
115
116#ifdef CONFIG_HOTPLUG_CPU 88#ifdef CONFIG_HOTPLUG_CPU
117DECLARE_PER_CPU(int, cpu_state); 89DECLARE_PER_CPU(int, cpu_state);
118 90
@@ -148,14 +120,11 @@ void cpu_idle(void)
148 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
149 /* endless idle loop with no priority at all */ 121 /* endless idle loop with no priority at all */
150 while (1) { 122 while (1) {
151 tick_nohz_stop_sched_tick(); 123 tick_nohz_stop_sched_tick(1);
152 while (!need_resched()) { 124 while (!need_resched()) {
153 void (*idle)(void);
154 125
155 rmb(); 126 rmb();
156 idle = pm_idle; 127
157 if (!idle)
158 idle = default_idle;
159 if (cpu_is_offline(smp_processor_id())) 128 if (cpu_is_offline(smp_processor_id()))
160 play_dead(); 129 play_dead();
161 /* 130 /*
@@ -165,7 +134,10 @@ void cpu_idle(void)
165 */ 134 */
166 local_irq_disable(); 135 local_irq_disable();
167 enter_idle(); 136 enter_idle();
168 idle(); 137 /* Don't trace irqs off for idle */
138 stop_critical_timings();
139 pm_idle();
140 start_critical_timings();
169 /* In many cases the interrupt that ended idle 141 /* In many cases the interrupt that ended idle
170 has already called exit_idle. But some idle 142 has already called exit_idle. But some idle
171 loops can be woken up without interrupt. */ 143 loops can be woken up without interrupt. */
@@ -366,10 +338,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
366 p->thread.fs = me->thread.fs; 338 p->thread.fs = me->thread.fs;
367 p->thread.gs = me->thread.gs; 339 p->thread.gs = me->thread.gs;
368 340
369 asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); 341 savesegment(gs, p->thread.gsindex);
370 asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); 342 savesegment(fs, p->thread.fsindex);
371 asm("mov %%es,%0" : "=m" (p->thread.es)); 343 savesegment(es, p->thread.es);
372 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 344 savesegment(ds, p->thread.ds);
373 345
374 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 346 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
375 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 347 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -408,7 +380,9 @@ out:
408void 380void
409start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 381start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
410{ 382{
411 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); 383 loadsegment(fs, 0);
384 loadsegment(es, 0);
385 loadsegment(ds, 0);
412 load_gs_index(0); 386 load_gs_index(0);
413 regs->ip = new_ip; 387 regs->ip = new_ip;
414 regs->sp = new_sp; 388 regs->sp = new_sp;
@@ -563,10 +537,11 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
563struct task_struct * 537struct task_struct *
564__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
565{ 539{
566 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
567 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
568 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
569 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex;
570 545
571 /* we're going to use this soon, after a few expensive things */ 546 /* we're going to use this soon, after a few expensive things */
572 if (next_p->fpu_counter>5) 547 if (next_p->fpu_counter>5)
@@ -581,52 +556,64 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
581 * Switch DS and ES. 556 * Switch DS and ES.
582 * This won't pick up thread selector changes, but I guess that is ok. 557 * This won't pick up thread selector changes, but I guess that is ok.
583 */ 558 */
584 asm volatile("mov %%es,%0" : "=m" (prev->es)); 559 savesegment(es, prev->es);
585 if (unlikely(next->es | prev->es)) 560 if (unlikely(next->es | prev->es))
586 loadsegment(es, next->es); 561 loadsegment(es, next->es);
587 562
588 asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); 563 savesegment(ds, prev->ds);
589 if (unlikely(next->ds | prev->ds)) 564 if (unlikely(next->ds | prev->ds))
590 loadsegment(ds, next->ds); 565 loadsegment(ds, next->ds);
591 566
567
568 /* We must save %fs and %gs before load_TLS() because
569 * %fs and %gs may be cleared by load_TLS().
570 *
571 * (e.g. xen_load_tls())
572 */
573 savesegment(fs, fsindex);
574 savesegment(gs, gsindex);
575
592 load_TLS(next, cpu); 576 load_TLS(next, cpu);
593 577
578 /*
579 * Leave lazy mode, flushing any hypercalls made here.
580 * This must be done before restoring TLS segments so
581 * the GDT and LDT are properly updated, and must be
582 * done before math_state_restore, so the TS bit is up
583 * to date.
584 */
585 arch_leave_lazy_cpu_mode();
586
594 /* 587 /*
595 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
596 */ 593 */
597 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
598 unsigned fsindex; 595 loadsegment(fs, next->fsindex);
599 asm volatile("movl %%fs,%0" : "=r" (fsindex)); 596 /*
600 /* segment register != 0 always requires a reload. 597 * Check if the user used a selector != 0; if yes
601 also reload when it has changed. 598 * clear 64bit base, since overloaded base is always
602 when prev process used 64bit base always reload 599 * mapped to the Null selector
603 to avoid an information leak. */ 600 */
604 if (unlikely(fsindex | next->fsindex | prev->fs)) { 601 if (fsindex)
605 loadsegment(fs, next->fsindex);
606 /* check if the user used a selector != 0
607 * if yes clear 64bit base, since overloaded base
608 * is always mapped to the Null selector
609 */
610 if (fsindex)
611 prev->fs = 0; 602 prev->fs = 0;
612 }
613 /* when next process has a 64bit base use it */
614 if (next->fs)
615 wrmsrl(MSR_FS_BASE, next->fs);
616 prev->fsindex = fsindex;
617 } 603 }
618 { 604 /* when next process has a 64bit base use it */
619 unsigned gsindex; 605 if (next->fs)
620 asm volatile("movl %%gs,%0" : "=r" (gsindex)); 606 wrmsrl(MSR_FS_BASE, next->fs);
621 if (unlikely(gsindex | next->gsindex | prev->gs)) { 607 prev->fsindex = fsindex;
622 load_gs_index(next->gsindex); 608
623 if (gsindex) 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex);
611 if (gsindex)
624 prev->gs = 0; 612 prev->gs = 0;
625 }
626 if (next->gs)
627 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
628 prev->gsindex = gsindex;
629 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
630 617
631 /* Must be after DS reload */ 618 /* Must be after DS reload */
632 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -639,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
639 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
640 627
641 write_pda(kernelstack, 628 write_pda(kernelstack,
642 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
643#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
644 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
645 /* 633 /*
@@ -798,7 +786,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
798 set_32bit_tls(task, FS_TLS, addr); 786 set_32bit_tls(task, FS_TLS, addr);
799 if (doit) { 787 if (doit) {
800 load_TLS(&task->thread, cpu); 788 load_TLS(&task->thread, cpu);
801 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 789 loadsegment(fs, FS_TLS_SEL);
802 } 790 }
803 task->thread.fsindex = FS_TLS_SEL; 791 task->thread.fsindex = FS_TLS_SEL;
804 task->thread.fs = 0; 792 task->thread.fs = 0;
@@ -808,7 +796,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
808 if (doit) { 796 if (doit) {
809 /* set the selector to 0 to not confuse 797 /* set the selector to 0 to not confuse
810 __switch_to */ 798 __switch_to */
811 asm volatile("movl %0,%%fs" :: "r" (0)); 799 loadsegment(fs, 0);
812 ret = checking_wrmsrl(MSR_FS_BASE, addr); 800 ret = checking_wrmsrl(MSR_FS_BASE, addr);
813 } 801 }
814 } 802 }
@@ -831,7 +819,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
831 if (task->thread.gsindex == GS_TLS_SEL) 819 if (task->thread.gsindex == GS_TLS_SEL)
832 base = read_32bit_tls(task, GS_TLS); 820 base = read_32bit_tls(task, GS_TLS);
833 else if (doit) { 821 else if (doit) {
834 asm("movl %%gs,%0" : "=r" (gsindex)); 822 savesegment(gs, gsindex);
835 if (gsindex) 823 if (gsindex)
836 rdmsrl(MSR_KERNEL_GS_BASE, base); 824 rdmsrl(MSR_KERNEL_GS_BASE, base);
837 else 825 else
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f282936..e37dccce85db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
943 return copy_regset_to_user(child, &user_x86_32_view, 943 return copy_regset_to_user(child, &user_x86_32_view,
944 REGSET_XFP, 944 REGSET_XFP,
945 0, sizeof(struct user_fxsr_struct), 945 0, sizeof(struct user_fxsr_struct),
946 datap); 946 datap) ? -EIO : 0;
947 947
948 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ 948 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
949 return copy_regset_from_user(child, &user_x86_32_view, 949 return copy_regset_from_user(child, &user_x86_32_view,
950 REGSET_XFP, 950 REGSET_XFP,
951 0, sizeof(struct user_fxsr_struct), 951 0, sizeof(struct user_fxsr_struct),
952 datap); 952 datap) ? -EIO : 0;
953#endif 953#endif
954 954
955#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 955#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe710..d13858818100 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
65 ICH_FORCE_HPET_RESUME, 65 ICH_FORCE_HPET_RESUME,
66 VT8237_FORCE_HPET_RESUME, 66 VT8237_FORCE_HPET_RESUME,
67 NVIDIA_FORCE_HPET_RESUME, 67 NVIDIA_FORCE_HPET_RESUME,
68 ATI_FORCE_HPET_RESUME,
68} force_hpet_resume_type; 69} force_hpet_resume_type;
69 70
70static void __iomem *rcba_base; 71static void __iomem *rcba_base;
@@ -158,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
158 159
159DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, 160DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
160 ich_force_enable_hpet); 161 ich_force_enable_hpet);
162DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
163 ich_force_enable_hpet);
161DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, 164DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
162 ich_force_enable_hpet); 165 ich_force_enable_hpet);
163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, 166DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
@@ -174,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
174 177
175static struct pci_dev *cached_dev; 178static struct pci_dev *cached_dev;
176 179
180static void hpet_print_force_info(void)
181{
182 printk(KERN_INFO "HPET not enabled in BIOS. "
183 "You might try hpet=force boot option\n");
184}
185
177static void old_ich_force_hpet_resume(void) 186static void old_ich_force_hpet_resume(void)
178{ 187{
179 u32 val; 188 u32 val;
@@ -253,8 +262,12 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
253{ 262{
254 if (hpet_force_user) 263 if (hpet_force_user)
255 old_ich_force_enable_hpet(dev); 264 old_ich_force_enable_hpet(dev);
265 else
266 hpet_print_force_info();
256} 267}
257 268
269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
270 old_ich_force_enable_hpet_user);
258DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, 271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
259 old_ich_force_enable_hpet_user); 272 old_ich_force_enable_hpet_user);
260DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, 273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
@@ -290,9 +303,14 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
290{ 303{
291 u32 uninitialized_var(val); 304 u32 uninitialized_var(val);
292 305
293 if (!hpet_force_user || hpet_address || force_hpet_address) 306 if (hpet_address || force_hpet_address)
294 return; 307 return;
295 308
309 if (!hpet_force_user) {
310 hpet_print_force_info();
311 return;
312 }
313
296 pci_read_config_dword(dev, 0x68, &val); 314 pci_read_config_dword(dev, 0x68, &val);
297 /* 315 /*
298 * Bit 7 is HPET enable bit. 316 * Bit 7 is HPET enable bit.
@@ -330,6 +348,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
330DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 348DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
331 vt8237_force_enable_hpet); 349 vt8237_force_enable_hpet);
332 350
351static void ati_force_hpet_resume(void)
352{
353 pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
354 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355}
356
357static void ati_force_enable_hpet(struct pci_dev *dev)
358{
359 u32 uninitialized_var(val);
360
361 if (hpet_address || force_hpet_address)
362 return;
363
364 if (!hpet_force_user) {
365 hpet_print_force_info();
366 return;
367 }
368
369 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val);
371 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address);
375 cached_dev = dev;
376 return;
377}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet);
380
333/* 381/*
334 * Undocumented chipset feature taken from LinuxBIOS. 382 * Undocumented chipset feature taken from LinuxBIOS.
335 */ 383 */
@@ -343,8 +391,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
343{ 391{
344 u32 uninitialized_var(val); 392 u32 uninitialized_var(val);
345 393
346 if (!hpet_force_user || hpet_address || force_hpet_address) 394 if (hpet_address || force_hpet_address)
395 return;
396
397 if (!hpet_force_user) {
398 hpet_print_force_info();
347 return; 399 return;
400 }
348 401
349 pci_write_config_dword(dev, 0x44, 0xfed00001); 402 pci_write_config_dword(dev, 0x44, 0xfed00001);
350 pci_read_config_dword(dev, 0x44, &val); 403 pci_read_config_dword(dev, 0x44, &val);
@@ -397,6 +450,9 @@ void force_hpet_resume(void)
397 case NVIDIA_FORCE_HPET_RESUME: 450 case NVIDIA_FORCE_HPET_RESUME:
398 nvidia_force_hpet_resume(); 451 nvidia_force_hpet_resume();
399 return; 452 return;
453 case ATI_FORCE_HPET_RESUME:
454 ati_force_hpet_resume();
455 return;
400 default: 456 default:
401 break; 457 break;
402 } 458 }
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f8..06a9f643817e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,7 +27,7 @@
27void (*pm_power_off)(void); 27void (*pm_power_off)(void);
28EXPORT_SYMBOL(pm_power_off); 28EXPORT_SYMBOL(pm_power_off);
29 29
30static long no_idt[3]; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32enum reboot_type reboot_type = BOOT_KBD;
33int reboot_force; 33int reboot_force;
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
@@ -201,15 +209,15 @@ core_initcall(reboot_init);
201 controller to pulse the CPU reset line, which is more thorough, but 209 controller to pulse the CPU reset line, which is more thorough, but
202 doesn't work with at least one type of 486 motherboard. It is easy 210 doesn't work with at least one type of 486 motherboard. It is easy
203 to stop this code working; hence the copious comments. */ 211 to stop this code working; hence the copious comments. */
204static unsigned long long 212static const unsigned long long
205real_mode_gdt_entries [3] = 213real_mode_gdt_entries [3] =
206{ 214{
207 0x0000000000000000ULL, /* Null descriptor */ 215 0x0000000000000000ULL, /* Null descriptor */
208 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ 216 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
209 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ 217 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
210}; 218};
211 219
212static struct desc_ptr 220static const struct desc_ptr
213real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, 221real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
214real_mode_idt = { 0x3ff, 0 }; 222real_mode_idt = { 0x3ff, 0 };
215 223
@@ -231,7 +239,7 @@ real_mode_idt = { 0x3ff, 0 };
231 239
232 More could be done here to set up the registers as if a CPU reset had 240 More could be done here to set up the registers as if a CPU reset had
233 occurred; hopefully real BIOSs don't assume much. */ 241 occurred; hopefully real BIOSs don't assume much. */
234static unsigned char real_mode_switch [] = 242static const unsigned char real_mode_switch [] =
235{ 243{
236 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 244 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
237 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ 245 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
@@ -245,7 +253,7 @@ static unsigned char real_mode_switch [] =
245 0x24, 0x10, /* f: andb $0x10,al */ 253 0x24, 0x10, /* f: andb $0x10,al */
246 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ 254 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
247}; 255};
248static unsigned char jump_to_bios [] = 256static const unsigned char jump_to_bios [] =
249{ 257{
250 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ 258 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
251}; 259};
@@ -255,7 +263,7 @@ static unsigned char jump_to_bios [] =
255 * specified by the code and length parameters. 263 * specified by the code and length parameters.
256 * We assume that length will aways be less that 100! 264 * We assume that length will aways be less that 100!
257 */ 265 */
258void machine_real_restart(unsigned char *code, int length) 266void machine_real_restart(const unsigned char *code, int length)
259{ 267{
260 local_irq_disable(); 268 local_irq_disable();
261 269
@@ -368,7 +376,7 @@ static void native_machine_emergency_restart(void)
368 } 376 }
369 377
370 case BOOT_TRIPLE: 378 case BOOT_TRIPLE:
371 load_idt((const struct desc_ptr *)&no_idt); 379 load_idt(&no_idt);
372 __asm__ __volatile__("int3"); 380 __asm__ __volatile__("int3");
373 381
374 reboot_type = BOOT_KBD; 382 reboot_type = BOOT_KBD;
@@ -403,24 +411,28 @@ void native_machine_shutdown(void)
403{ 411{
404 /* Stop the cpus and apics */ 412 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 414
408 /* The boot cpu is always logical cpu 0 */ 415 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 416 int reboot_cpu_id = 0;
417 cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
410 418
411#ifdef CONFIG_X86_32 419#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 420 /* See if there has been given a command line override */
413 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 421 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
414 cpu_online(reboot_cpu)) 422 cpu_online(reboot_cpu)) {
415 reboot_cpu_id = reboot_cpu; 423 reboot_cpu_id = reboot_cpu;
424 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
425 }
416#endif 426#endif
417 427
418 /* Make certain the cpu I'm about to reboot on is online */ 428 /* Make certain the cpu I'm about to reboot on is online */
419 if (!cpu_online(reboot_cpu_id)) 429 if (!cpu_online(reboot_cpu_id)) {
420 reboot_cpu_id = smp_processor_id(); 430 reboot_cpu_id = smp_processor_id();
431 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
432 }
421 433
422 /* Make certain I only run on the appropriate processor */ 434 /* Make certain I only run on the appropriate processor */
423 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); 435 set_cpus_allowed_ptr(current, newmask);
424 436
425 /* O.K Now that I'm on the appropriate processor, 437 /* O.K Now that I'm on the appropriate processor,
426 * stop all of the others. 438 * stop all of the others.
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c2..61a837743fe5 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
49 void (*reboot_fixup)(struct pci_dev *); 49 void (*reboot_fixup)(struct pci_dev *);
50}; 50};
51 51
52static struct device_fixup fixups_table[] = { 52static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
64 */ 64 */
65void mach_reboot_fixups(void) 65void mach_reboot_fixups(void)
66{ 66{
67 struct device_fixup *cur; 67 const struct device_fixup *cur;
68 struct pci_dev *dev; 68 struct pci_dev *dev;
69 int i; 69 int i;
70 70
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
24 * used to save some data for jumping back
25 */
26#define DATA(offset) (PAGE_SIZE/2+(offset))
27
28/* Minimal CPU state */
29#define ESP DATA(0x0)
30#define CR0 DATA(0x4)
31#define CR3 DATA(0x8)
32#define CR4 DATA(0xc)
33
34/* other data */
35#define CP_VA_CONTROL_PAGE DATA(0x10)
36#define CP_PA_PGD DATA(0x14)
37#define CP_PA_SWAP_PAGE DATA(0x18)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
39
23 .text 40 .text
24 .align PAGE_SIZE 41 .align PAGE_SIZE
25 .globl relocate_kernel 42 .globl relocate_kernel
26relocate_kernel: 43relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 44 /* Save the CPU context, used for jumping back */
45
46 pushl %ebx
47 pushl %esi
48 pushl %edi
49 pushl %ebp
50 pushf
51
52 movl 20+8(%esp), %ebp /* list of pages */
53 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
54 movl %esp, ESP(%edi)
55 movl %cr0, %eax
56 movl %eax, CR0(%edi)
57 movl %cr3, %eax
58 movl %eax, CR3(%edi)
59 movl %cr4, %eax
60 movl %eax, CR4(%edi)
28 61
29#ifdef CONFIG_X86_PAE 62#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 63 /* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
138 171
139relocate_new_kernel: 172relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 173 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 174 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 175 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 176 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 177 movl 20+16(%esp), %ecx /* cpu_has_pae */
178 movl 20+20(%esp), %esi /* preserve_context */
145 179
146 /* zero out flags, and disable interrupts */ 180 /* zero out flags, and disable interrupts */
147 pushl $0 181 pushl $0
148 popfl 182 popfl
149 183
184 /* save some information for jumping back */
185 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
186 movl %edi, CP_VA_CONTROL_PAGE(%edi)
187 movl PTR(PA_PGD)(%ebp), %eax
188 movl %eax, CP_PA_PGD(%edi)
189 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
190 movl %eax, CP_PA_SWAP_PAGE(%edi)
191 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
192
150 /* get physical address of control page now */ 193 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 194 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 195 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
197 xorl %eax, %eax 240 xorl %eax, %eax
198 movl %eax, %cr3 241 movl %eax, %cr3
199 242
243 movl CP_PA_SWAP_PAGE(%edi), %eax
244 pushl %eax
245 pushl %ebx
246 call swap_pages
247 addl $8, %esp
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB, it's handy, and not processor dependent.
252 */
253 xorl %eax, %eax
254 movl %eax, %cr3
255
256 /* set all of the registers to known values */
257 /* leave %esp alone */
258
259 testl %esi, %esi
260 jnz 1f
261 xorl %edi, %edi
262 xorl %eax, %eax
263 xorl %ebx, %ebx
264 xorl %ecx, %ecx
265 xorl %edx, %edx
266 xorl %esi, %esi
267 xorl %ebp, %ebp
268 ret
2691:
270 popl %edx
271 movl CP_PA_SWAP_PAGE(%edi), %esp
272 addl $PAGE_SIZE, %esp
2732:
274 call *%edx
275
276 /* get the re-entry point of the peer system */
277 movl 0(%esp), %ebp
278 call 1f
2791:
280 popl %ebx
281 subl $(1b - relocate_kernel), %ebx
282 movl CP_VA_CONTROL_PAGE(%ebx), %edi
283 lea PAGE_SIZE(%ebx), %esp
284 movl CP_PA_SWAP_PAGE(%ebx), %eax
285 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
286 pushl %eax
287 pushl %edx
288 call swap_pages
289 addl $8, %esp
290 movl CP_PA_PGD(%ebx), %eax
291 movl %eax, %cr3
292 movl %cr0, %eax
293 orl $(1<<31), %eax
294 movl %eax, %cr0
295 lea PAGE_SIZE(%edi), %esp
296 movl %edi, %eax
297 addl $(virtual_mapped - relocate_kernel), %eax
298 pushl %eax
299 ret
300
301virtual_mapped:
302 movl CR4(%edi), %eax
303 movl %eax, %cr4
304 movl CR3(%edi), %eax
305 movl %eax, %cr3
306 movl CR0(%edi), %eax
307 movl %eax, %cr0
308 movl ESP(%edi), %esp
309 movl %ebp, %eax
310
311 popf
312 popl %ebp
313 popl %edi
314 popl %esi
315 popl %ebx
316 ret
317
200 /* Do the copies */ 318 /* Do the copies */
201 movl %ebx, %ecx 319swap_pages:
320 movl 8(%esp), %edx
321 movl 4(%esp), %ecx
322 pushl %ebp
323 pushl %ebx
324 pushl %edi
325 pushl %esi
326 movl %ecx, %ebx
202 jmp 1f 327 jmp 1f
203 328
2040: /* top, read another word from the indirection page */ 3290: /* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 351 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 352 andl $0xfffff000, %esi
228 353
354 movl %edi, %eax
355 movl %esi, %ebp
356
357 movl %edx, %edi
229 movl $1024, %ecx 358 movl $1024, %ecx
230 rep ; movsl 359 rep ; movsl
231 jmp 0b
232 360
2333: 361 movl %ebp, %edi
234 362 movl %eax, %esi
235 /* To be certain of avoiding problems with self-modifying code 363 movl $1024, %ecx
236 * I need to execute a serializing instruction here. 364 rep ; movsl
237 * So I flush the TLB, it's handy, and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241 365
242 /* set all of the registers to known values */ 366 movl %eax, %edi
243 /* leave %esp alone */ 367 movl %edx, %esi
368 movl $1024, %ecx
369 rep ; movsl
244 370
245 xorl %eax, %eax 371 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 372 jmp 0b
247 xorl %ecx, %ecx 3733:
248 xorl %edx, %edx 374 popl %esi
249 xorl %esi, %esi 375 popl %edi
250 xorl %edi, %edi 376 popl %ebx
251 xorl %ebp, %ebp 377 popl %ebp
252 ret 378 ret
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..b520dae02bf4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1,139 +1,885 @@
1#include <linux/kernel.h> 1/*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 *
6 * Memory region support
7 * David Parsons <orc@pell.chi.il.us>, July-August 1999
8 *
9 * Added E820 sanitization routine (removes overlapping memory regions);
10 * Brian Moyle <bmoyle@mvista.com>, February 2001
11 *
12 * Moved CPU detection code to cpu/${cpu}.c
13 * Patrick Mochel <mochel@osdl.org>, March 2002
14 *
15 * Provisions for empty E820 memory regions (reported by certain BIOSes).
16 * Alex Achenbach <xela@slit.de>, December 2002.
17 *
18 */
19
20/*
21 * This file handles the architecture-dependent parts of initialization
22 */
23
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/mmzone.h>
27#include <linux/screen_info.h>
28#include <linux/ioport.h>
29#include <linux/acpi.h>
30#include <linux/apm_bios.h>
31#include <linux/initrd.h>
32#include <linux/bootmem.h>
33#include <linux/seq_file.h>
34#include <linux/console.h>
35#include <linux/mca.h>
36#include <linux/root_dev.h>
37#include <linux/highmem.h>
2#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/efi.h>
3#include <linux/init.h> 40#include <linux/init.h>
4#include <linux/bootmem.h> 41#include <linux/edd.h>
42#include <linux/iscsi_ibft.h>
43#include <linux/nodemask.h>
44#include <linux/kexec.h>
45#include <linux/dmi.h>
46#include <linux/pfn.h>
47#include <linux/pci.h>
48#include <asm/pci-direct.h>
49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
51
52#include <linux/errno.h>
53#include <linux/kernel.h>
54#include <linux/stddef.h>
55#include <linux/unistd.h>
56#include <linux/ptrace.h>
57#include <linux/slab.h>
58#include <linux/user.h>
59#include <linux/delay.h>
60
61#include <linux/kallsyms.h>
62#include <linux/cpufreq.h>
63#include <linux/dma-mapping.h>
64#include <linux/ctype.h>
65#include <linux/uaccess.h>
66
5#include <linux/percpu.h> 67#include <linux/percpu.h>
6#include <asm/smp.h> 68#include <linux/crash_dump.h>
7#include <asm/percpu.h> 69
70#include <video/edid.h>
71
72#include <asm/mtrr.h>
73#include <asm/apic.h>
74#include <asm/e820.h>
75#include <asm/mpspec.h>
76#include <asm/setup.h>
77#include <asm/arch_hooks.h>
78#include <asm/efi.h>
8#include <asm/sections.h> 79#include <asm/sections.h>
80#include <asm/dmi.h>
81#include <asm/io_apic.h>
82#include <asm/ist.h>
83#include <asm/vmi.h>
84#include <setup_arch.h>
85#include <asm/bios_ebda.h>
86#include <asm/cacheflush.h>
9#include <asm/processor.h> 87#include <asm/processor.h>
10#include <asm/setup.h> 88#include <asm/bugs.h>
89
90#include <asm/system.h>
91#include <asm/vsyscall.h>
92#include <asm/smp.h>
93#include <asm/desc.h>
94#include <asm/dma.h>
95#include <asm/iommu.h>
96#include <asm/mmu_context.h>
97#include <asm/proto.h>
98
99#include <mach_apic.h>
100#include <asm/paravirt.h>
101
102#include <asm/percpu.h>
11#include <asm/topology.h> 103#include <asm/topology.h>
12#include <asm/mpspec.h>
13#include <asm/apicdef.h> 104#include <asm/apicdef.h>
105#ifdef CONFIG_X86_64
106#include <asm/numa_64.h>
107#endif
14 108
15#ifdef CONFIG_X86_LOCAL_APIC 109#ifndef ARCH_SETUP
16unsigned int num_processors; 110#define ARCH_SETUP
17unsigned disabled_cpus __cpuinitdata; 111#endif
18/* Processor that is doing the boot up */
19unsigned int boot_cpu_physical_apicid = -1U;
20EXPORT_SYMBOL(boot_cpu_physical_apicid);
21 112
22DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; 113#ifndef CONFIG_DEBUG_BOOT_PARAMS
23EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 114struct boot_params __initdata boot_params;
115#else
116struct boot_params boot_params;
117#endif
24 118
25/* Bitmask of physically existing CPUs */ 119/*
26physid_mask_t phys_cpu_present_map; 120 * Machine setup..
121 */
122static struct resource data_resource = {
123 .name = "Kernel data",
124 .start = 0,
125 .end = 0,
126 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
127};
128
129static struct resource code_resource = {
130 .name = "Kernel code",
131 .start = 0,
132 .end = 0,
133 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
134};
135
136static struct resource bss_resource = {
137 .name = "Kernel bss",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
141};
142
143
144#ifdef CONFIG_X86_32
145/* This value is set up by the early boot code to point to the value
146 immediately after the boot time page tables. It contains a *physical*
147 address, and must not be in the .bss segment! */
148unsigned long init_pg_tables_start __initdata = ~0UL;
149unsigned long init_pg_tables_end __initdata = ~0UL;
150
151static struct resource video_ram_resource = {
152 .name = "Video RAM area",
153 .start = 0xa0000,
154 .end = 0xbffff,
155 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
156};
157
158/* cpu data as detected by the assembly code in head.S */
159struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
160/* common cpu data for all cpus */
161struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
162EXPORT_SYMBOL(boot_cpu_data);
163static void set_mca_bus(int x)
164{
165#ifdef CONFIG_MCA
166 MCA_bus = x;
167#endif
168}
169
170unsigned int def_to_bigsmp;
171
172/* for MCA, but anyone else can use it if they want */
173unsigned int machine_id;
174unsigned int machine_submodel_id;
175unsigned int BIOS_revision;
176
177struct apm_info apm_info;
178EXPORT_SYMBOL(apm_info);
179
180#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
181 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
182struct ist_info ist_info;
183EXPORT_SYMBOL(ist_info);
184#else
185struct ist_info ist_info;
27#endif 186#endif
28 187
29#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 188#else
189struct cpuinfo_x86 boot_cpu_data __read_mostly;
190EXPORT_SYMBOL(boot_cpu_data);
191#endif
192
193
194#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
195unsigned long mmu_cr4_features;
196#else
197unsigned long mmu_cr4_features = X86_CR4_PAE;
198#endif
199
200/* Boot loader ID as an integer, for the benefit of proc_dointvec */
201int bootloader_type;
202
30/* 203/*
31 * Copy data used in early init routines from the initial arrays to the 204 * Early DMI memory
32 * per cpu data areas. These arrays then become expendable and the
33 * *_early_ptr's are zeroed indicating that the static arrays are gone.
34 */ 205 */
35static void __init setup_per_cpu_maps(void) 206int dmi_alloc_index;
207char dmi_alloc_data[DMI_MAX_DATA];
208
209/*
210 * Setup options
211 */
212struct screen_info screen_info;
213EXPORT_SYMBOL(screen_info);
214struct edid_info edid_info;
215EXPORT_SYMBOL_GPL(edid_info);
216
217extern int root_mountflags;
218
219unsigned long saved_video_mode;
220
221#define RAMDISK_IMAGE_START_MASK 0x07FF
222#define RAMDISK_PROMPT_FLAG 0x8000
223#define RAMDISK_LOAD_FLAG 0x4000
224
225static char __initdata command_line[COMMAND_LINE_SIZE];
226
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd;
229#ifdef CONFIG_EDD_MODULE
230EXPORT_SYMBOL(edd);
231#endif
232/**
233 * copy_edd() - Copy the BIOS EDD information
234 * from boot_params into a safe place.
235 *
236 */
237static inline void copy_edd(void)
238{
239 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
240 sizeof(edd.mbr_signature));
241 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
242 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
243 edd.edd_info_nr = boot_params.eddbuf_entries;
244}
245#else
246static inline void copy_edd(void)
247{
248}
249#endif
250
251#ifdef CONFIG_BLK_DEV_INITRD
252
253#ifdef CONFIG_X86_32
254
255#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
256static void __init relocate_initrd(void)
36{ 257{
37 int cpu;
38 258
39 for_each_possible_cpu(cpu) { 259 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
40 per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; 260 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
41 per_cpu(x86_bios_cpu_apicid, cpu) = 261 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
42 x86_bios_cpu_apicid_init[cpu]; 262 u64 ramdisk_here;
43#ifdef CONFIG_NUMA 263 unsigned long slop, clen, mapaddr;
44 per_cpu(x86_cpu_to_node_map, cpu) = 264 char *p, *q;
45 x86_cpu_to_node_map_init[cpu]; 265
266 /* We need to move the initrd down into lowmem */
267 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
268 PAGE_SIZE);
269
270 if (ramdisk_here == -1ULL)
271 panic("Cannot find place for new RAMDISK of size %lld\n",
272 ramdisk_size);
273
274 /* Note: this includes all the lowmem currently occupied by
275 the initrd, we rely on that fact to keep the data intact. */
276 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
277 "NEW RAMDISK");
278 initrd_start = ramdisk_here + PAGE_OFFSET;
279 initrd_end = initrd_start + ramdisk_size;
280 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
281 ramdisk_here, ramdisk_here + ramdisk_size);
282
283 q = (char *)initrd_start;
284
285 /* Copy any lowmem portion of the initrd */
286 if (ramdisk_image < end_of_lowmem) {
287 clen = end_of_lowmem - ramdisk_image;
288 p = (char *)__va(ramdisk_image);
289 memcpy(q, p, clen);
290 q += clen;
291 ramdisk_image += clen;
292 ramdisk_size -= clen;
293 }
294
295 /* Copy the highmem portion of the initrd */
296 while (ramdisk_size) {
297 slop = ramdisk_image & ~PAGE_MASK;
298 clen = ramdisk_size;
299 if (clen > MAX_MAP_CHUNK-slop)
300 clen = MAX_MAP_CHUNK-slop;
301 mapaddr = ramdisk_image & PAGE_MASK;
302 p = early_ioremap(mapaddr, clen+slop);
303 memcpy(q, p+slop, clen);
304 early_iounmap(p, clen+slop);
305 q += clen;
306 ramdisk_image += clen;
307 ramdisk_size -= clen;
308 }
309 /* high pages is not converted by early_res_to_bootmem */
310 ramdisk_image = boot_params.hdr.ramdisk_image;
311 ramdisk_size = boot_params.hdr.ramdisk_size;
312 printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
313 " %08llx - %08llx\n",
314 ramdisk_image, ramdisk_image + ramdisk_size - 1,
315 ramdisk_here, ramdisk_here + ramdisk_size - 1);
316}
46#endif 317#endif
318
319static void __init reserve_initrd(void)
320{
321 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
322 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
323 u64 ramdisk_end = ramdisk_image + ramdisk_size;
324 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
325
326 if (!boot_params.hdr.type_of_loader ||
327 !ramdisk_image || !ramdisk_size)
328 return; /* No initrd provided by bootloader */
329
330 initrd_start = 0;
331
332 if (ramdisk_size >= (end_of_lowmem>>1)) {
333 free_early(ramdisk_image, ramdisk_end);
334 printk(KERN_ERR "initrd too large to handle, "
335 "disabling initrd\n");
336 return;
337 }
338
339 printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
340 ramdisk_end);
341
342
343 if (ramdisk_end <= end_of_lowmem) {
344 /* All in lowmem, easy case */
345 /*
346 * don't need to reserve again, already reserved early
347 * in i386_start_kernel
348 */
349 initrd_start = ramdisk_image + PAGE_OFFSET;
350 initrd_end = initrd_start + ramdisk_size;
351 return;
47 } 352 }
48 353
49 /* indicate the early static arrays will soon be gone */ 354#ifdef CONFIG_X86_32
50 x86_cpu_to_apicid_early_ptr = NULL; 355 relocate_initrd();
51 x86_bios_cpu_apicid_early_ptr = NULL; 356#else
52#ifdef CONFIG_NUMA 357 printk(KERN_ERR "initrd extends beyond end of memory "
53 x86_cpu_to_node_map_early_ptr = NULL; 358 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
359 ramdisk_end, end_of_lowmem);
360 initrd_start = 0;
54#endif 361#endif
362 free_early(ramdisk_image, ramdisk_end);
55} 363}
364#else
365static void __init reserve_initrd(void)
366{
367}
368#endif /* CONFIG_BLK_DEV_INITRD */
369
370static void __init parse_setup_data(void)
371{
372 struct setup_data *data;
373 u64 pa_data;
374
375 if (boot_params.hdr.version < 0x0209)
376 return;
377 pa_data = boot_params.hdr.setup_data;
378 while (pa_data) {
379 data = early_ioremap(pa_data, PAGE_SIZE);
380 switch (data->type) {
381 case SETUP_E820_EXT:
382 parse_e820_ext(data, pa_data);
383 break;
384 default:
385 break;
386 }
387 pa_data = data->next;
388 early_iounmap(data, PAGE_SIZE);
389 }
390}
391
392static void __init e820_reserve_setup_data(void)
393{
394 struct setup_data *data;
395 u64 pa_data;
396 int found = 0;
397
398 if (boot_params.hdr.version < 0x0209)
399 return;
400 pa_data = boot_params.hdr.setup_data;
401 while (pa_data) {
402 data = early_ioremap(pa_data, sizeof(*data));
403 e820_update_range(pa_data, sizeof(*data)+data->len,
404 E820_RAM, E820_RESERVED_KERN);
405 found = 1;
406 pa_data = data->next;
407 early_iounmap(data, sizeof(*data));
408 }
409 if (!found)
410 return;
56 411
57#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP 412 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
58cpumask_t *cpumask_of_cpu_map __read_mostly; 413 memcpy(&e820_saved, &e820, sizeof(struct e820map));
59EXPORT_SYMBOL(cpumask_of_cpu_map); 414 printk(KERN_INFO "extended physical RAM map:\n");
415 e820_print_map("reserve setup_data");
416}
60 417
61/* requires nr_cpu_ids to be initialized */ 418static void __init reserve_early_setup_data(void)
62static void __init setup_cpumask_of_cpu(void)
63{ 419{
64 int i; 420 struct setup_data *data;
421 u64 pa_data;
422 char buf[32];
423
424 if (boot_params.hdr.version < 0x0209)
425 return;
426 pa_data = boot_params.hdr.setup_data;
427 while (pa_data) {
428 data = early_ioremap(pa_data, sizeof(*data));
429 sprintf(buf, "setup data %x", data->type);
430 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
431 pa_data = data->next;
432 early_iounmap(data, sizeof(*data));
433 }
434}
435
436/*
437 * --------- Crashkernel reservation ------------------------------
438 */
439
440#ifdef CONFIG_KEXEC
441
442/**
443 * Reserve @size bytes of crashkernel memory at any suitable offset.
444 *
445 * @size: Size of the crashkernel memory to reserve.
446 * Returns the base address on success, and -1ULL on failure.
447 */
448unsigned long long find_and_reserve_crashkernel(unsigned long long size)
449{
450 const unsigned long long alignment = 16<<20; /* 16M */
451 unsigned long long start = 0LL;
452
453 while (1) {
454 int ret;
455
456 start = find_e820_area(start, ULONG_MAX, size, alignment);
457 if (start == -1ULL)
458 return start;
459
460 /* try to reserve it */
461 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
462 if (ret >= 0)
463 return start;
65 464
66 /* alloc_bootmem zeroes memory */ 465 start += alignment;
67 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); 466 }
68 for (i = 0; i < nr_cpu_ids; i++) 467}
69 cpu_set(i, cpumask_of_cpu_map[i]); 468
469static inline unsigned long long get_total_mem(void)
470{
471 unsigned long long total;
472
473 total = max_low_pfn - min_low_pfn;
474#ifdef CONFIG_HIGHMEM
475 total += highend_pfn - highstart_pfn;
476#endif
477
478 return total << PAGE_SHIFT;
479}
480
481static void __init reserve_crashkernel(void)
482{
483 unsigned long long total_mem;
484 unsigned long long crash_size, crash_base;
485 int ret;
486
487 total_mem = get_total_mem();
488
489 ret = parse_crashkernel(boot_command_line, total_mem,
490 &crash_size, &crash_base);
491 if (ret != 0 || crash_size <= 0)
492 return;
493
494 /* 0 means: find the address automatically */
495 if (crash_base <= 0) {
496 crash_base = find_and_reserve_crashkernel(crash_size);
497 if (crash_base == -1ULL) {
498 pr_info("crashkernel reservation failed. "
499 "No suitable area found.\n");
500 return;
501 }
502 } else {
503 ret = reserve_bootmem_generic(crash_base, crash_size,
504 BOOTMEM_EXCLUSIVE);
505 if (ret < 0) {
506 pr_info("crashkernel reservation failed - "
507 "memory is in use\n");
508 return;
509 }
510 }
511
512 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
513 "for crashkernel (System RAM: %ldMB)\n",
514 (unsigned long)(crash_size >> 20),
515 (unsigned long)(crash_base >> 20),
516 (unsigned long)(total_mem >> 20));
517
518 crashk_res.start = crash_base;
519 crashk_res.end = crash_base + crash_size - 1;
520 insert_resource(&iomem_resource, &crashk_res);
70} 521}
71#else 522#else
72static inline void setup_cpumask_of_cpu(void) { } 523static void __init reserve_crashkernel(void)
524{
525}
73#endif 526#endif
74 527
75#ifdef CONFIG_X86_32 528static struct resource standard_io_resources[] = {
76/* 529 { .name = "dma1", .start = 0x00, .end = 0x1f,
77 * Great future not-so-futuristic plan: make i386 and x86_64 do it 530 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
78 * the same way 531 { .name = "pic1", .start = 0x20, .end = 0x21,
532 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
533 { .name = "timer0", .start = 0x40, .end = 0x43,
534 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
535 { .name = "timer1", .start = 0x50, .end = 0x53,
536 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
537 { .name = "keyboard", .start = 0x60, .end = 0x60,
538 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
539 { .name = "keyboard", .start = 0x64, .end = 0x64,
540 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
541 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
542 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
543 { .name = "pic2", .start = 0xa0, .end = 0xa1,
544 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
545 { .name = "dma2", .start = 0xc0, .end = 0xdf,
546 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
547 { .name = "fpu", .start = 0xf0, .end = 0xff,
548 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
549};
550
551static void __init reserve_standard_io_resources(void)
552{
553 int i;
554
555 /* request I/O space for devices used on all i[345]86 PCs */
556 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
557 request_resource(&ioport_resource, &standard_io_resources[i]);
558
559}
560
561#ifdef CONFIG_PROC_VMCORE
562/* elfcorehdr= specifies the location of elf core header
563 * stored by the crashed kernel. This option will be passed
564 * by kexec loader to the capture kernel.
79 */ 565 */
80unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 566static int __init setup_elfcorehdr(char *arg)
81EXPORT_SYMBOL(__per_cpu_offset); 567{
568 char *end;
569 if (!arg)
570 return -EINVAL;
571 elfcorehdr_addr = memparse(arg, &end);
572 return end > arg ? 0 : -EINVAL;
573}
574early_param("elfcorehdr", setup_elfcorehdr);
82#endif 575#endif
83 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
581/*
582 * Determine if we were loaded by an EFI loader. If so, then we have also been
583 * passed the efi memmap, systab, etc., so we should use these data structures
584 * for initialization. Note, the efi init code path is determined by the
585 * global efi_enabled. This allows the same kernel image to be used on existing
586 * systems (with a traditional BIOS) as well as on EFI systems.
587 */
84/* 588/*
85 * Great future plan: 589 * setup_arch - architecture-specific boot-time initializations
86 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 590 *
87 * Always point %gs to its beginning 591 * Note: On x86_64, fixmaps are ready for use even before this is called.
88 */ 592 */
89void __init setup_per_cpu_areas(void) 593
594void __init setup_arch(char **cmdline_p)
90{ 595{
91 int i, highest_cpu = 0; 596#ifdef CONFIG_X86_32
92 unsigned long size; 597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
598 visws_early_detect();
599 pre_setup_arch_hook();
600#else
601 printk(KERN_INFO "Command line: %s\n", boot_command_line);
602#endif
93 603
94#ifdef CONFIG_HOTPLUG_CPU 604 early_cpu_init();
95 prefill_possible_map(); 605 early_ioremap_init();
606
607 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
608 screen_info = boot_params.screen_info;
609 edid_info = boot_params.edid_info;
610#ifdef CONFIG_X86_32
611 apm_info.bios = boot_params.apm_bios_info;
612 ist_info = boot_params.ist_info;
613 if (boot_params.sys_desc_table.length != 0) {
614 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
615 machine_id = boot_params.sys_desc_table.table[0];
616 machine_submodel_id = boot_params.sys_desc_table.table[1];
617 BIOS_revision = boot_params.sys_desc_table.table[2];
618 }
619#endif
620 saved_video_mode = boot_params.hdr.vid_mode;
621 bootloader_type = boot_params.hdr.type_of_loader;
622
623#ifdef CONFIG_BLK_DEV_RAM
624 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
625 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
626 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
627#endif
628#ifdef CONFIG_EFI
629 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
630#ifdef CONFIG_X86_32
631 "EL32",
632#else
633 "EL64",
96#endif 634#endif
635 4)) {
636 efi_enabled = 1;
637 efi_reserve_early();
638 }
639#endif
640
641 ARCH_SETUP
642
643 setup_memory_map();
644 parse_setup_data();
645 /* update the e820_saved too */
646 e820_reserve_setup_data();
97 647
98 /* Copy section for each CPU (we discard the original) */ 648 copy_edd();
99 size = PERCPU_ENOUGH_ROOM;
100 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
101 size);
102 649
103 for_each_possible_cpu(i) { 650 if (!boot_params.hdr.root_flags)
104 char *ptr; 651 root_mountflags &= ~MS_RDONLY;
105#ifndef CONFIG_NEED_MULTIPLE_NODES 652 init_mm.start_code = (unsigned long) _text;
106 ptr = alloc_bootmem_pages(size); 653 init_mm.end_code = (unsigned long) _etext;
654 init_mm.end_data = (unsigned long) _edata;
655#ifdef CONFIG_X86_32
656 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
107#else 657#else
108 int node = early_cpu_to_node(i); 658 init_mm.brk = (unsigned long) &_end;
109 if (!node_online(node) || !NODE_DATA(node)) {
110 ptr = alloc_bootmem_pages(size);
111 printk(KERN_INFO
112 "cpu %d has no node or node-local memory\n", i);
113 }
114 else
115 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
116#endif 659#endif
117 if (!ptr) 660
118 panic("Cannot allocate cpu data for CPU %d\n", i); 661 code_resource.start = virt_to_phys(_text);
119#ifdef CONFIG_X86_64 662 code_resource.end = virt_to_phys(_etext)-1;
120 cpu_pda(i)->data_offset = ptr - __per_cpu_start; 663 data_resource.start = virt_to_phys(_etext);
664 data_resource.end = virt_to_phys(_edata)-1;
665 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line;
670
671 parse_early_param();
672
673 /* after early param, so could get panic from serial */
674 reserve_early_setup_data();
675
676 if (acpi_mps_check()) {
677#ifdef CONFIG_X86_LOCAL_APIC
678 disable_apic = 1;
679#endif
680 setup_clear_cpu_cap(X86_FEATURE_APIC);
681 }
682
683#ifdef CONFIG_PCI
684 if (pci_early_dump_regs)
685 early_dump_pci_devices();
686#endif
687
688 finish_e820_parsing();
689
690#ifdef CONFIG_X86_32
691 probe_roms();
692#endif
693
694 /* after parse_early_param, so could debug it */
695 insert_resource(&iomem_resource, &code_resource);
696 insert_resource(&iomem_resource, &data_resource);
697 insert_resource(&iomem_resource, &bss_resource);
698
699 if (efi_enabled)
700 efi_init();
701
702#ifdef CONFIG_X86_32
703 if (ppro_with_ram_bug()) {
704 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
705 E820_RESERVED);
706 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
707 printk(KERN_INFO "fixed physical RAM map:\n");
708 e820_print_map("bad_ppro");
709 }
710#else
711 early_gart_iommu_check();
712#endif
713
714 /*
715 * partially used pages are not usable - thus
716 * we are rounding upwards:
717 */
718 max_pfn = e820_end_of_ram_pfn();
719
720 /* preallocate 4k for mptable mpc */
721 early_reserve_e820_mpc_new();
722 /* update e820 for memory not covered by WB MTRRs */
723 mtrr_bp_init();
724 if (mtrr_trim_uncached_memory(max_pfn))
725 max_pfn = e820_end_of_ram_pfn();
726
727#ifdef CONFIG_X86_32
728 /* max_low_pfn get updated here */
729 find_low_pfn_range();
121#else 730#else
122 __per_cpu_offset[i] = ptr - __per_cpu_start; 731 num_physpages = max_pfn;
732
733 check_efer();
734
735 /* How many end-of-memory variables you have, grandma! */
736 /* need this before calling reserve_initrd */
737 if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
738 max_low_pfn = e820_end_of_low_ram_pfn();
739 else
740 max_low_pfn = max_pfn;
741
742 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
123#endif 743#endif
124 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
125 744
126 highest_cpu = i; 745 /* max_pfn_mapped is updated here */
746 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
747 max_pfn_mapped = max_low_pfn_mapped;
748
749#ifdef CONFIG_X86_64
750 if (max_pfn > max_low_pfn) {
751 max_pfn_mapped = init_memory_mapping(1UL<<32,
752 max_pfn<<PAGE_SHIFT);
753 /* can we preseve max_low_pfn ?*/
754 max_low_pfn = max_pfn;
127 } 755 }
756#endif
128 757
129 nr_cpu_ids = highest_cpu + 1; 758 /*
130 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); 759 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
760 */
131 761
132 /* Setup percpu data maps */ 762#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
133 setup_per_cpu_maps(); 763 if (init_ohci1394_dma_early)
764 init_ohci1394_dma_on_all_controllers();
765#endif
134 766
135 /* Setup cpumask_of_cpu map */ 767 reserve_initrd();
136 setup_cpumask_of_cpu(); 768
137} 769#ifdef CONFIG_X86_64
770 vsmp_init();
771#endif
772
773 dmi_scan_machine();
774
775 io_delay_init();
776
777 /*
778 * Parse the ACPI tables for possible boot-time SMP configuration.
779 */
780 acpi_boot_table_init();
781
782#ifdef CONFIG_ACPI_NUMA
783 /*
784 * Parse SRAT to discover nodes.
785 */
786 acpi_numa_init();
787#endif
788
789 initmem_init(0, max_pfn);
790
791#ifdef CONFIG_X86_64
792 dma32_reserve_bootmem();
793#endif
138 794
795#ifdef CONFIG_ACPI_SLEEP
796 /*
797 * Reserve low memory region for sleep support.
798 */
799 acpi_reserve_bootmem();
139#endif 800#endif
801#ifdef CONFIG_X86_FIND_SMP_CONFIG
802 /*
803 * Find and reserve possible boot-time SMP configuration:
804 */
805 find_smp_config();
806#endif
807 reserve_crashkernel();
808
809 reserve_ibft_region();
810
811#ifdef CONFIG_KVM_CLOCK
812 kvmclock_init();
813#endif
814
815#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
816 /*
817 * Must be after max_low_pfn is determined, and before kernel
818 * pagetables are setup.
819 */
820 vmi_init();
821#endif
822
823 paravirt_pagetable_setup_start(swapper_pg_dir);
824 paging_init();
825 paravirt_pagetable_setup_done(swapper_pg_dir);
826 paravirt_post_allocator_init();
827
828#ifdef CONFIG_X86_64
829 map_vsyscall();
830#endif
831
832#ifdef CONFIG_X86_GENERICARCH
833 generic_apic_probe();
834#endif
835
836 early_quirks();
837
838 /*
839 * Read APIC and some other early information from ACPI tables.
840 */
841 acpi_boot_init();
842
843#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
844 /*
845 * get boot-time SMP configuration:
846 */
847 if (smp_found_config)
848 get_smp_config();
849#endif
850
851 prefill_possible_map();
852#ifdef CONFIG_X86_64
853 init_cpu_to_node();
854#endif
855
856 init_apic_mappings();
857 ioapic_init_mappings();
858
859#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
860 if (def_to_bigsmp)
861 printk(KERN_WARNING "More than 8 CPUs detected and "
862 "CONFIG_X86_PC cannot handle it.\nUse "
863 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
864#endif
865 kvm_guest_init();
866
867 e820_reserve_resources();
868 e820_mark_nosave_regions(max_low_pfn);
869
870#ifdef CONFIG_X86_32
871 request_resource(&iomem_resource, &video_ram_resource);
872#endif
873 reserve_standard_io_resources();
874
875 e820_setup_gap();
876
877#ifdef CONFIG_VT
878#if defined(CONFIG_VGA_CONSOLE)
879 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
880 conswitchp = &vga_con;
881#elif defined(CONFIG_DUMMY_CONSOLE)
882 conswitchp = &dummy_con;
883#endif
884#endif
885}
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
deleted file mode 100644
index aee0e8200777..000000000000
--- a/arch/x86/kernel/setup64.c
+++ /dev/null
@@ -1,287 +0,0 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/bootmem.h>
12#include <linux/bitops.h>
13#include <linux/module.h>
14#include <linux/kgdb.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/proto.h>
25#include <asm/sections.h>
26#include <asm/setup.h>
27#include <asm/genapic.h>
28
29#ifndef CONFIG_DEBUG_BOOT_PARAMS
30struct boot_params __initdata boot_params;
31#else
32struct boot_params boot_params;
33#endif
34
35cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
36
37struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
38EXPORT_SYMBOL(_cpu_pda);
39struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
40
41struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
42
43char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
44
45unsigned long __supported_pte_mask __read_mostly = ~0UL;
46EXPORT_SYMBOL_GPL(__supported_pte_mask);
47
48static int do_not_nx __cpuinitdata = 0;
49
50/* noexec=on|off
51Control non executable mappings for 64bit processes.
52
53on Enable(default)
54off Disable
55*/
56static int __init nonx_setup(char *str)
57{
58 if (!str)
59 return -EINVAL;
60 if (!strncmp(str, "on", 2)) {
61 __supported_pte_mask |= _PAGE_NX;
62 do_not_nx = 0;
63 } else if (!strncmp(str, "off", 3)) {
64 do_not_nx = 1;
65 __supported_pte_mask &= ~_PAGE_NX;
66 }
67 return 0;
68}
69early_param("noexec", nonx_setup);
70
71int force_personality32 = 0;
72
73/* noexec32=on|off
74Control non executable heap for 32bit processes.
75To control the stack too use noexec=off
76
77on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
78off PROT_READ implies PROT_EXEC
79*/
80static int __init nonx32_setup(char *str)
81{
82 if (!strcmp(str, "on"))
83 force_personality32 &= ~READ_IMPLIES_EXEC;
84 else if (!strcmp(str, "off"))
85 force_personality32 |= READ_IMPLIES_EXEC;
86 return 1;
87}
88__setup("noexec32=", nonx32_setup);
89
90void pda_init(int cpu)
91{
92 struct x8664_pda *pda = cpu_pda(cpu);
93
94 /* Setup up data that may be needed in __get_free_pages early */
95 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
96 /* Memory clobbers used to order PDA accessed */
97 mb();
98 wrmsrl(MSR_GS_BASE, pda);
99 mb();
100
101 pda->cpunumber = cpu;
102 pda->irqcount = -1;
103 pda->kernelstack =
104 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
105 pda->active_mm = &init_mm;
106 pda->mmu_state = 0;
107
108 if (cpu == 0) {
109 /* others are initialized in smpboot.c */
110 pda->pcurrent = &init_task;
111 pda->irqstackptr = boot_cpu_stack;
112 } else {
113 pda->irqstackptr = (char *)
114 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
115 if (!pda->irqstackptr)
116 panic("cannot allocate irqstack for cpu %d", cpu);
117 }
118
119
120 pda->irqstackptr += IRQSTACKSIZE-64;
121}
122
123char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
124__attribute__((section(".bss.page_aligned")));
125
126extern asmlinkage void ignore_sysret(void);
127
128/* May not be marked __init: used by software suspend */
129void syscall_init(void)
130{
131 /*
132 * LSTAR and STAR live in a bit strange symbiosis.
133 * They both write to the same internal register. STAR allows to set CS/DS
134 * but only a 32bit target. LSTAR sets the 64bit rip.
135 */
136 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
137 wrmsrl(MSR_LSTAR, system_call);
138 wrmsrl(MSR_CSTAR, ignore_sysret);
139
140#ifdef CONFIG_IA32_EMULATION
141 syscall32_cpu_init ();
142#endif
143
144 /* Flags to clear on syscall */
145 wrmsrl(MSR_SYSCALL_MASK,
146 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
147}
148
149void __cpuinit check_efer(void)
150{
151 unsigned long efer;
152
153 rdmsrl(MSR_EFER, efer);
154 if (!(efer & EFER_NX) || do_not_nx) {
155 __supported_pte_mask &= ~_PAGE_NX;
156 }
157}
158
159unsigned long kernel_eflags;
160
161/*
162 * Copies of the original ist values from the tss are only accessed during
163 * debugging, no special alignment required.
164 */
165DEFINE_PER_CPU(struct orig_ist, orig_ist);
166
167/*
168 * cpu_init() initializes state that is per-CPU. Some data is already
169 * initialized (naturally) in the bootstrap process, such as the GDT
170 * and IDT. We reload them nevertheless, this function acts as a
171 * 'CPU state barrier', nothing should get across.
172 * A lot of state is already set up in PDA init.
173 */
174void __cpuinit cpu_init (void)
175{
176 int cpu = stack_smp_processor_id();
177 struct tss_struct *t = &per_cpu(init_tss, cpu);
178 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
179 unsigned long v;
180 char *estacks = NULL;
181 struct task_struct *me;
182 int i;
183
184 /* CPU 0 is initialised in head64.c */
185 if (cpu != 0) {
186 pda_init(cpu);
187 } else
188 estacks = boot_exception_stacks;
189
190 me = current;
191
192 if (cpu_test_and_set(cpu, cpu_initialized))
193 panic("CPU#%d already initialized!\n", cpu);
194
195 printk("Initializing CPU#%d\n", cpu);
196
197 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
198
199 /*
200 * Initialize the per-CPU GDT with the boot GDT,
201 * and set up the GDT descriptor:
202 */
203 if (cpu)
204 memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
205
206 cpu_gdt_descr[cpu].size = GDT_SIZE;
207 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
208 load_idt((const struct desc_ptr *)&idt_descr);
209
210 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
211 syscall_init();
212
213 wrmsrl(MSR_FS_BASE, 0);
214 wrmsrl(MSR_KERNEL_GS_BASE, 0);
215 barrier();
216
217 check_efer();
218
219 /*
220 * set up and load the per-CPU TSS
221 */
222 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
223 static const unsigned int order[N_EXCEPTION_STACKS] = {
224 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
225 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
226 };
227 if (cpu) {
228 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
229 if (!estacks)
230 panic("Cannot allocate exception stack %ld %d\n",
231 v, cpu);
232 }
233 estacks += PAGE_SIZE << order[v];
234 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
235 }
236
237 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
238 /*
239 * <= is required because the CPU will access up to
240 * 8 bits beyond the end of the IO permission bitmap.
241 */
242 for (i = 0; i <= IO_BITMAP_LONGS; i++)
243 t->io_bitmap[i] = ~0UL;
244
245 atomic_inc(&init_mm.mm_count);
246 me->active_mm = &init_mm;
247 if (me->mm)
248 BUG();
249 enter_lazy_tlb(&init_mm, me);
250
251 set_tss_desc(cpu, t);
252 load_TR_desc();
253 load_LDT(&init_mm.context);
254
255#ifdef CONFIG_KGDB
256 /*
257 * If the kgdb is connected no debug regs should be altered. This
258 * is only applicable when KGDB and a KGDB I/O module are built
259 * into the kernel and you are using early debugging with
260 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
261 */
262 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
263 arch_kgdb_ops.correct_hw_break();
264 else {
265#endif
266 /*
267 * Clear all 6 debug registers:
268 */
269
270 set_debugreg(0UL, 0);
271 set_debugreg(0UL, 1);
272 set_debugreg(0UL, 2);
273 set_debugreg(0UL, 3);
274 set_debugreg(0UL, 6);
275 set_debugreg(0UL, 7);
276#ifdef CONFIG_KGDB
277 /* If the kgdb is connected no debug regs should be altered. */
278 }
279#endif
280
281 fpu_init();
282
283 raw_local_save_flags(kernel_eflags);
284
285 if (is_uv_system())
286 uv_cpu_init();
287}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
deleted file mode 100644
index 5a2f8e063887..000000000000
--- a/arch/x86/kernel/setup_32.c
+++ /dev/null
@@ -1,964 +0,0 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 *
6 * Memory region support
7 * David Parsons <orc@pell.chi.il.us>, July-August 1999
8 *
9 * Added E820 sanitization routine (removes overlapping memory regions);
10 * Brian Moyle <bmoyle@mvista.com>, February 2001
11 *
12 * Moved CPU detection code to cpu/${cpu}.c
13 * Patrick Mochel <mochel@osdl.org>, March 2002
14 *
15 * Provisions for empty E820 memory regions (reported by certain BIOSes).
16 * Alex Achenbach <xela@slit.de>, December 2002.
17 *
18 */
19
20/*
21 * This file handles the architecture-dependent parts of initialization
22 */
23
24#include <linux/sched.h>
25#include <linux/mm.h>
26#include <linux/mmzone.h>
27#include <linux/screen_info.h>
28#include <linux/ioport.h>
29#include <linux/acpi.h>
30#include <linux/apm_bios.h>
31#include <linux/initrd.h>
32#include <linux/bootmem.h>
33#include <linux/seq_file.h>
34#include <linux/console.h>
35#include <linux/mca.h>
36#include <linux/root_dev.h>
37#include <linux/highmem.h>
38#include <linux/module.h>
39#include <linux/efi.h>
40#include <linux/init.h>
41#include <linux/edd.h>
42#include <linux/iscsi_ibft.h>
43#include <linux/nodemask.h>
44#include <linux/kexec.h>
45#include <linux/crash_dump.h>
46#include <linux/dmi.h>
47#include <linux/pfn.h>
48#include <linux/pci.h>
49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
51
52#include <video/edid.h>
53
54#include <asm/mtrr.h>
55#include <asm/apic.h>
56#include <asm/e820.h>
57#include <asm/mpspec.h>
58#include <asm/mmzone.h>
59#include <asm/setup.h>
60#include <asm/arch_hooks.h>
61#include <asm/sections.h>
62#include <asm/io_apic.h>
63#include <asm/ist.h>
64#include <asm/io.h>
65#include <asm/vmi.h>
66#include <setup_arch.h>
67#include <asm/bios_ebda.h>
68#include <asm/cacheflush.h>
69#include <asm/processor.h>
70
71/* This value is set up by the early boot code to point to the value
72 immediately after the boot time page tables. It contains a *physical*
73 address, and must not be in the .bss segment! */
74unsigned long init_pg_tables_end __initdata = ~0UL;
75
76/*
77 * Machine setup..
78 */
79static struct resource data_resource = {
80 .name = "Kernel data",
81 .start = 0,
82 .end = 0,
83 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
84};
85
86static struct resource code_resource = {
87 .name = "Kernel code",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
91};
92
93static struct resource bss_resource = {
94 .name = "Kernel bss",
95 .start = 0,
96 .end = 0,
97 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
98};
99
100static struct resource video_ram_resource = {
101 .name = "Video RAM area",
102 .start = 0xa0000,
103 .end = 0xbffff,
104 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
105};
106
107static struct resource standard_io_resources[] = { {
108 .name = "dma1",
109 .start = 0x0000,
110 .end = 0x001f,
111 .flags = IORESOURCE_BUSY | IORESOURCE_IO
112}, {
113 .name = "pic1",
114 .start = 0x0020,
115 .end = 0x0021,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "timer0",
119 .start = 0x0040,
120 .end = 0x0043,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer1",
124 .start = 0x0050,
125 .end = 0x0053,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "keyboard",
129 .start = 0x0060,
130 .end = 0x0060,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0064,
135 .end = 0x0064,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159/* cpu data as detected by the assembly code in head.S */
160struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
161/* common cpu data for all cpus */
162struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
163EXPORT_SYMBOL(boot_cpu_data);
164
165unsigned int def_to_bigsmp;
166
167#ifndef CONFIG_X86_PAE
168unsigned long mmu_cr4_features;
169#else
170unsigned long mmu_cr4_features = X86_CR4_PAE;
171#endif
172
173/* for MCA, but anyone else can use it if they want */
174unsigned int machine_id;
175unsigned int machine_submodel_id;
176unsigned int BIOS_revision;
177
178/* Boot loader ID as an integer, for the benefit of proc_dointvec */
179int bootloader_type;
180
181/* user-defined highmem size */
182static unsigned int highmem_pages = -1;
183
184/*
185 * Setup options
186 */
187struct screen_info screen_info;
188EXPORT_SYMBOL(screen_info);
189struct apm_info apm_info;
190EXPORT_SYMBOL(apm_info);
191struct edid_info edid_info;
192EXPORT_SYMBOL_GPL(edid_info);
193struct ist_info ist_info;
194#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
195 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
196EXPORT_SYMBOL(ist_info);
197#endif
198
199extern void early_cpu_init(void);
200extern int root_mountflags;
201
202unsigned long saved_video_mode;
203
204#define RAMDISK_IMAGE_START_MASK 0x07FF
205#define RAMDISK_PROMPT_FLAG 0x8000
206#define RAMDISK_LOAD_FLAG 0x4000
207
208static char __initdata command_line[COMMAND_LINE_SIZE];
209
210#ifndef CONFIG_DEBUG_BOOT_PARAMS
211struct boot_params __initdata boot_params;
212#else
213struct boot_params boot_params;
214#endif
215
216#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
217struct edd edd;
218#ifdef CONFIG_EDD_MODULE
219EXPORT_SYMBOL(edd);
220#endif
221/**
222 * copy_edd() - Copy the BIOS EDD information
223 * from boot_params into a safe place.
224 *
225 */
226static inline void copy_edd(void)
227{
228 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
229 sizeof(edd.mbr_signature));
230 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
231 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
232 edd.edd_info_nr = boot_params.eddbuf_entries;
233}
234#else
235static inline void copy_edd(void)
236{
237}
238#endif
239
240int __initdata user_defined_memmap;
241
242/*
243 * "mem=nopentium" disables the 4MB page tables.
244 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
245 * to <mem>, overriding the bios size.
246 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
247 * <start> to <start>+<mem>, overriding the bios size.
248 *
249 * HPA tells me bootloaders need to parse mem=, so no new
250 * option should be mem= [also see Documentation/i386/boot.txt]
251 */
252static int __init parse_mem(char *arg)
253{
254 if (!arg)
255 return -EINVAL;
256
257 if (strcmp(arg, "nopentium") == 0) {
258 setup_clear_cpu_cap(X86_FEATURE_PSE);
259 } else {
260 /* If the user specifies memory size, we
261 * limit the BIOS-provided memory map to
262 * that size. exactmap can be used to specify
263 * the exact map. mem=number can be used to
264 * trim the existing memory map.
265 */
266 unsigned long long mem_size;
267
268 mem_size = memparse(arg, &arg);
269 limit_regions(mem_size);
270 user_defined_memmap = 1;
271 }
272 return 0;
273}
274early_param("mem", parse_mem);
275
276#ifdef CONFIG_PROC_VMCORE
277/* elfcorehdr= specifies the location of elf core header
278 * stored by the crashed kernel.
279 */
280static int __init parse_elfcorehdr(char *arg)
281{
282 if (!arg)
283 return -EINVAL;
284
285 elfcorehdr_addr = memparse(arg, &arg);
286 return 0;
287}
288early_param("elfcorehdr", parse_elfcorehdr);
289#endif /* CONFIG_PROC_VMCORE */
290
291/*
292 * highmem=size forces highmem to be exactly 'size' bytes.
293 * This works even on boxes that have no highmem otherwise.
294 * This also works to reduce highmem size on bigger boxes.
295 */
296static int __init parse_highmem(char *arg)
297{
298 if (!arg)
299 return -EINVAL;
300
301 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
302 return 0;
303}
304early_param("highmem", parse_highmem);
305
306/*
307 * vmalloc=size forces the vmalloc area to be exactly 'size'
308 * bytes. This can be used to increase (or decrease) the
309 * vmalloc area - the default is 128m.
310 */
311static int __init parse_vmalloc(char *arg)
312{
313 if (!arg)
314 return -EINVAL;
315
316 __VMALLOC_RESERVE = memparse(arg, &arg);
317 return 0;
318}
319early_param("vmalloc", parse_vmalloc);
320
321/*
322 * reservetop=size reserves a hole at the top of the kernel address space which
323 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
324 * so relocating the fixmap can be done before paging initialization.
325 */
326static int __init parse_reservetop(char *arg)
327{
328 unsigned long address;
329
330 if (!arg)
331 return -EINVAL;
332
333 address = memparse(arg, &arg);
334 reserve_top_address(address);
335 return 0;
336}
337early_param("reservetop", parse_reservetop);
338
339/*
340 * Determine low and high memory ranges:
341 */
342unsigned long __init find_max_low_pfn(void)
343{
344 unsigned long max_low_pfn;
345
346 max_low_pfn = max_pfn;
347 if (max_low_pfn > MAXMEM_PFN) {
348 if (highmem_pages == -1)
349 highmem_pages = max_pfn - MAXMEM_PFN;
350 if (highmem_pages + MAXMEM_PFN < max_pfn)
351 max_pfn = MAXMEM_PFN + highmem_pages;
352 if (highmem_pages + MAXMEM_PFN > max_pfn) {
353 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
354 highmem_pages = 0;
355 }
356 max_low_pfn = MAXMEM_PFN;
357#ifndef CONFIG_HIGHMEM
358 /* Maximum memory usable is what is directly addressable */
359 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
360 MAXMEM>>20);
361 if (max_pfn > MAX_NONPAE_PFN)
362 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
363 else
364 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
365 max_pfn = MAXMEM_PFN;
366#else /* !CONFIG_HIGHMEM */
367#ifndef CONFIG_HIGHMEM64G
368 if (max_pfn > MAX_NONPAE_PFN) {
369 max_pfn = MAX_NONPAE_PFN;
370 printk(KERN_WARNING "Warning only 4GB will be used.\n");
371 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
372 }
373#endif /* !CONFIG_HIGHMEM64G */
374#endif /* !CONFIG_HIGHMEM */
375 } else {
376 if (highmem_pages == -1)
377 highmem_pages = 0;
378#ifdef CONFIG_HIGHMEM
379 if (highmem_pages >= max_pfn) {
380 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
381 highmem_pages = 0;
382 }
383 if (highmem_pages) {
384 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
385 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
386 highmem_pages = 0;
387 }
388 max_low_pfn -= highmem_pages;
389 }
390#else
391 if (highmem_pages)
392 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
393#endif
394 }
395 return max_low_pfn;
396}
397
398#define BIOS_LOWMEM_KILOBYTES 0x413
399
400/*
401 * The BIOS places the EBDA/XBDA at the top of conventional
402 * memory, and usually decreases the reported amount of
403 * conventional memory (int 0x12) too. This also contains a
404 * workaround for Dell systems that neglect to reserve EBDA.
405 * The same workaround also avoids a problem with the AMD768MPX
406 * chipset: reserve a page before VGA to prevent PCI prefetch
407 * into it (errata #56). Usually the page is reserved anyways,
408 * unless you have no PS/2 mouse plugged in.
409 */
410static void __init reserve_ebda_region(void)
411{
412 unsigned int lowmem, ebda_addr;
413
414 /* To determine the position of the EBDA and the */
415 /* end of conventional memory, we need to look at */
416 /* the BIOS data area. In a paravirtual environment */
417 /* that area is absent. We'll just have to assume */
418 /* that the paravirt case can handle memory setup */
419 /* correctly, without our help. */
420 if (paravirt_enabled())
421 return;
422
423 /* end of low (conventional) memory */
424 lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
425 lowmem <<= 10;
426
427 /* start of EBDA area */
428 ebda_addr = get_bios_ebda();
429
430 /* Fixup: bios puts an EBDA in the top 64K segment */
431 /* of conventional memory, but does not adjust lowmem. */
432 if ((lowmem - ebda_addr) <= 0x10000)
433 lowmem = ebda_addr;
434
435 /* Fixup: bios does not report an EBDA at all. */
436 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
437 if ((ebda_addr == 0) && (lowmem >= 0x9f000))
438 lowmem = 0x9f000;
439
440 /* Paranoia: should never happen, but... */
441 if ((lowmem == 0) || (lowmem >= 0x100000))
442 lowmem = 0x9f000;
443
444 /* reserve all memory between lowmem and the 1MB mark */
445 reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
446}
447
448#ifndef CONFIG_NEED_MULTIPLE_NODES
449static void __init setup_bootmem_allocator(void);
450static unsigned long __init setup_memory(void)
451{
452 /*
453 * partially used pages are not usable - thus
454 * we are rounding upwards:
455 */
456 min_low_pfn = PFN_UP(init_pg_tables_end);
457
458 max_low_pfn = find_max_low_pfn();
459
460#ifdef CONFIG_HIGHMEM
461 highstart_pfn = highend_pfn = max_pfn;
462 if (max_pfn > max_low_pfn) {
463 highstart_pfn = max_low_pfn;
464 }
465 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
466 pages_to_mb(highend_pfn - highstart_pfn));
467 num_physpages = highend_pfn;
468 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
469#else
470 num_physpages = max_low_pfn;
471 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
472#endif
473#ifdef CONFIG_FLATMEM
474 max_mapnr = num_physpages;
475#endif
476 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
477 pages_to_mb(max_low_pfn));
478
479 setup_bootmem_allocator();
480
481 return max_low_pfn;
482}
483
484static void __init zone_sizes_init(void)
485{
486 unsigned long max_zone_pfns[MAX_NR_ZONES];
487 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
488 max_zone_pfns[ZONE_DMA] =
489 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
490 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
491#ifdef CONFIG_HIGHMEM
492 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
493 add_active_range(0, 0, highend_pfn);
494#else
495 add_active_range(0, 0, max_low_pfn);
496#endif
497
498 free_area_init_nodes(max_zone_pfns);
499}
500#else
501extern unsigned long __init setup_memory(void);
502extern void zone_sizes_init(void);
503#endif /* !CONFIG_NEED_MULTIPLE_NODES */
504
505static inline unsigned long long get_total_mem(void)
506{
507 unsigned long long total;
508
509 total = max_low_pfn - min_low_pfn;
510#ifdef CONFIG_HIGHMEM
511 total += highend_pfn - highstart_pfn;
512#endif
513
514 return total << PAGE_SHIFT;
515}
516
517#ifdef CONFIG_KEXEC
518static void __init reserve_crashkernel(void)
519{
520 unsigned long long total_mem;
521 unsigned long long crash_size, crash_base;
522 int ret;
523
524 total_mem = get_total_mem();
525
526 ret = parse_crashkernel(boot_command_line, total_mem,
527 &crash_size, &crash_base);
528 if (ret == 0 && crash_size > 0) {
529 if (crash_base > 0) {
530 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
531 "for crashkernel (System RAM: %ldMB)\n",
532 (unsigned long)(crash_size >> 20),
533 (unsigned long)(crash_base >> 20),
534 (unsigned long)(total_mem >> 20));
535
536 if (reserve_bootmem(crash_base, crash_size,
537 BOOTMEM_EXCLUSIVE) < 0) {
538 printk(KERN_INFO "crashkernel reservation "
539 "failed - memory is in use\n");
540 return;
541 }
542
543 crashk_res.start = crash_base;
544 crashk_res.end = crash_base + crash_size - 1;
545 } else
546 printk(KERN_INFO "crashkernel reservation failed - "
547 "you have to specify a base address\n");
548 }
549}
550#else
551static inline void __init reserve_crashkernel(void)
552{}
553#endif
554
555#ifdef CONFIG_BLK_DEV_INITRD
556
557static bool do_relocate_initrd = false;
558
559static void __init reserve_initrd(void)
560{
561 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
562 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
563 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
564 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
565 unsigned long ramdisk_here;
566
567 initrd_start = 0;
568
569 if (!boot_params.hdr.type_of_loader ||
570 !ramdisk_image || !ramdisk_size)
571 return; /* No initrd provided by bootloader */
572
573 if (ramdisk_end < ramdisk_image) {
574 printk(KERN_ERR "initrd wraps around end of memory, "
575 "disabling initrd\n");
576 return;
577 }
578 if (ramdisk_size >= end_of_lowmem/2) {
579 printk(KERN_ERR "initrd too large to handle, "
580 "disabling initrd\n");
581 return;
582 }
583 if (ramdisk_end <= end_of_lowmem) {
584 /* All in lowmem, easy case */
585 reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
586 initrd_start = ramdisk_image + PAGE_OFFSET;
587 initrd_end = initrd_start+ramdisk_size;
588 return;
589 }
590
591 /* We need to move the initrd down into lowmem */
592 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
593
594 /* Note: this includes all the lowmem currently occupied by
595 the initrd, we rely on that fact to keep the data intact. */
596 reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
597 initrd_start = ramdisk_here + PAGE_OFFSET;
598 initrd_end = initrd_start + ramdisk_size;
599
600 do_relocate_initrd = true;
601}
602
603#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
604
605static void __init relocate_initrd(void)
606{
607 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
608 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
609 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
610 unsigned long ramdisk_here;
611 unsigned long slop, clen, mapaddr;
612 char *p, *q;
613
614 if (!do_relocate_initrd)
615 return;
616
617 ramdisk_here = initrd_start - PAGE_OFFSET;
618
619 q = (char *)initrd_start;
620
621 /* Copy any lowmem portion of the initrd */
622 if (ramdisk_image < end_of_lowmem) {
623 clen = end_of_lowmem - ramdisk_image;
624 p = (char *)__va(ramdisk_image);
625 memcpy(q, p, clen);
626 q += clen;
627 ramdisk_image += clen;
628 ramdisk_size -= clen;
629 }
630
631 /* Copy the highmem portion of the initrd */
632 while (ramdisk_size) {
633 slop = ramdisk_image & ~PAGE_MASK;
634 clen = ramdisk_size;
635 if (clen > MAX_MAP_CHUNK-slop)
636 clen = MAX_MAP_CHUNK-slop;
637 mapaddr = ramdisk_image & PAGE_MASK;
638 p = early_ioremap(mapaddr, clen+slop);
639 memcpy(q, p+slop, clen);
640 early_iounmap(p, clen+slop);
641 q += clen;
642 ramdisk_image += clen;
643 ramdisk_size -= clen;
644 }
645}
646
647#endif /* CONFIG_BLK_DEV_INITRD */
648
649void __init setup_bootmem_allocator(void)
650{
651 unsigned long bootmap_size;
652 /*
653 * Initialize the boot-time allocator (with low memory only):
654 */
655 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
656
657 register_bootmem_low_pages(max_low_pfn);
658
659 /*
660 * Reserve the bootmem bitmap itself as well. We do this in two
661 * steps (first step was init_bootmem()) because this catches
662 * the (very unlikely) case of us accidentally initializing the
663 * bootmem allocator with an invalid RAM area.
664 */
665 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
666 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
667 BOOTMEM_DEFAULT);
668
669 /*
670 * reserve physical page 0 - it's a special BIOS page on many boxes,
671 * enabling clean reboots, SMP operation, laptop functions.
672 */
673 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
674
675 /* reserve EBDA region */
676 reserve_ebda_region();
677
678#ifdef CONFIG_SMP
679 /*
680 * But first pinch a few for the stack/trampoline stuff
681 * FIXME: Don't need the extra page at 4K, but need to fix
682 * trampoline before removing it. (see the GDT stuff)
683 */
684 reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
685#endif
686#ifdef CONFIG_ACPI_SLEEP
687 /*
688 * Reserve low memory region for sleep support.
689 */
690 acpi_reserve_bootmem();
691#endif
692#ifdef CONFIG_X86_FIND_SMP_CONFIG
693 /*
694 * Find and reserve possible boot-time SMP configuration:
695 */
696 find_smp_config();
697#endif
698#ifdef CONFIG_BLK_DEV_INITRD
699 reserve_initrd();
700#endif
701 numa_kva_reserve();
702 reserve_crashkernel();
703
704 reserve_ibft_region();
705}
706
707/*
708 * The node 0 pgdat is initialized before all of these because
709 * it's needed for bootmem. node>0 pgdats have their virtual
710 * space allocated before the pagetables are in place to access
711 * them, so they can't be cleared then.
712 *
713 * This should all compile down to nothing when NUMA is off.
714 */
715static void __init remapped_pgdat_init(void)
716{
717 int nid;
718
719 for_each_online_node(nid) {
720 if (nid != 0)
721 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
722 }
723}
724
725#ifdef CONFIG_MCA
726static void set_mca_bus(int x)
727{
728 MCA_bus = x;
729}
730#else
731static void set_mca_bus(int x) { }
732#endif
733
734/* Overridden in paravirt.c if CONFIG_PARAVIRT */
735char * __init __attribute__((weak)) memory_setup(void)
736{
737 return machine_specific_memory_setup();
738}
739
740#ifdef CONFIG_NUMA
741/*
742 * In the golden day, when everything among i386 and x86_64 will be
743 * integrated, this will not live here
744 */
745void *x86_cpu_to_node_map_early_ptr;
746int x86_cpu_to_node_map_init[NR_CPUS] = {
747 [0 ... NR_CPUS-1] = NUMA_NO_NODE
748};
749DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
750#endif
751
752/*
753 * Determine if we were loaded by an EFI loader. If so, then we have also been
754 * passed the efi memmap, systab, etc., so we should use these data structures
755 * for initialization. Note, the efi init code path is determined by the
756 * global efi_enabled. This allows the same kernel image to be used on existing
757 * systems (with a traditional BIOS) as well as on EFI systems.
758 */
759void __init setup_arch(char **cmdline_p)
760{
761 unsigned long max_low_pfn;
762
763 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
764 pre_setup_arch_hook();
765 early_cpu_init();
766 early_ioremap_init();
767
768#ifdef CONFIG_EFI
769 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
770 "EL32", 4))
771 efi_enabled = 1;
772#endif
773
774 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
775 screen_info = boot_params.screen_info;
776 edid_info = boot_params.edid_info;
777 apm_info.bios = boot_params.apm_bios_info;
778 ist_info = boot_params.ist_info;
779 saved_video_mode = boot_params.hdr.vid_mode;
780 if( boot_params.sys_desc_table.length != 0 ) {
781 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
782 machine_id = boot_params.sys_desc_table.table[0];
783 machine_submodel_id = boot_params.sys_desc_table.table[1];
784 BIOS_revision = boot_params.sys_desc_table.table[2];
785 }
786 bootloader_type = boot_params.hdr.type_of_loader;
787
788#ifdef CONFIG_BLK_DEV_RAM
789 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
790 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
791 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
792#endif
793 ARCH_SETUP
794
795 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
796 print_memory_map(memory_setup());
797
798 copy_edd();
799
800 if (!boot_params.hdr.root_flags)
801 root_mountflags &= ~MS_RDONLY;
802 init_mm.start_code = (unsigned long) _text;
803 init_mm.end_code = (unsigned long) _etext;
804 init_mm.end_data = (unsigned long) _edata;
805 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
806
807 code_resource.start = virt_to_phys(_text);
808 code_resource.end = virt_to_phys(_etext)-1;
809 data_resource.start = virt_to_phys(_etext);
810 data_resource.end = virt_to_phys(_edata)-1;
811 bss_resource.start = virt_to_phys(&__bss_start);
812 bss_resource.end = virt_to_phys(&__bss_stop)-1;
813
814 parse_early_param();
815
816 if (user_defined_memmap) {
817 printk(KERN_INFO "user-defined physical RAM map:\n");
818 print_memory_map("user");
819 }
820
821 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
822 *cmdline_p = command_line;
823
824 if (efi_enabled)
825 efi_init();
826
827 /* update e820 for memory not covered by WB MTRRs */
828 propagate_e820_map();
829 mtrr_bp_init();
830 if (mtrr_trim_uncached_memory(max_pfn))
831 propagate_e820_map();
832
833 max_low_pfn = setup_memory();
834
835#ifdef CONFIG_KVM_CLOCK
836 kvmclock_init();
837#endif
838
839#ifdef CONFIG_VMI
840 /*
841 * Must be after max_low_pfn is determined, and before kernel
842 * pagetables are setup.
843 */
844 vmi_init();
845#endif
846 kvm_guest_init();
847
848 /*
849 * NOTE: before this point _nobody_ is allowed to allocate
850 * any memory using the bootmem allocator. Although the
851 * allocator is now initialised only the first 8Mb of the kernel
852 * virtual address space has been mapped. All allocations before
853 * paging_init() has completed must use the alloc_bootmem_low_pages()
854 * variant (which allocates DMA'able memory) and care must be taken
855 * not to exceed the 8Mb limit.
856 */
857
858#ifdef CONFIG_SMP
859 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
860#endif
861 paging_init();
862
863 /*
864 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
865 */
866
867#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
868 if (init_ohci1394_dma_early)
869 init_ohci1394_dma_on_all_controllers();
870#endif
871
872 remapped_pgdat_init();
873 sparse_init();
874 zone_sizes_init();
875
876 /*
877 * NOTE: at this point the bootmem allocator is fully available.
878 */
879
880#ifdef CONFIG_BLK_DEV_INITRD
881 relocate_initrd();
882#endif
883
884 paravirt_post_allocator_init();
885
886 dmi_scan_machine();
887
888 io_delay_init();
889
890#ifdef CONFIG_X86_SMP
891 /*
892 * setup to use the early static init tables during kernel startup
893 * X86_SMP will exclude sub-arches that don't deal well with it.
894 */
895 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
896 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
897#ifdef CONFIG_NUMA
898 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
899#endif
900#endif
901
902#ifdef CONFIG_X86_GENERICARCH
903 generic_apic_probe();
904#endif
905
906#ifdef CONFIG_ACPI
907 /*
908 * Parse the ACPI tables for possible boot-time SMP configuration.
909 */
910 acpi_boot_table_init();
911#endif
912
913 early_quirks();
914
915#ifdef CONFIG_ACPI
916 acpi_boot_init();
917
918#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
919 if (def_to_bigsmp)
920 printk(KERN_WARNING "More than 8 CPUs detected and "
921 "CONFIG_X86_PC cannot handle it.\nUse "
922 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
923#endif
924#endif
925#ifdef CONFIG_X86_LOCAL_APIC
926 if (smp_found_config)
927 get_smp_config();
928#endif
929
930 e820_register_memory();
931 e820_mark_nosave_regions();
932
933#ifdef CONFIG_VT
934#if defined(CONFIG_VGA_CONSOLE)
935 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
936 conswitchp = &vga_con;
937#elif defined(CONFIG_DUMMY_CONSOLE)
938 conswitchp = &dummy_con;
939#endif
940#endif
941}
942
943/*
944 * Request address space for all standard resources
945 *
946 * This is called just before pcibios_init(), which is also a
947 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
948 */
949static int __init request_standard_resources(void)
950{
951 int i;
952
953 printk(KERN_INFO "Setting up standard PCI resources\n");
954 init_iomem_resources(&code_resource, &data_resource, &bss_resource);
955
956 request_resource(&iomem_resource, &video_ram_resource);
957
958 /* request I/O space for devices used on all i[345]86 PCs */
959 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
960 request_resource(&ioport_resource, &standard_io_resources[i]);
961 return 0;
962}
963
964subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
deleted file mode 100644
index 6dff1286ad8a..000000000000
--- a/arch/x86/kernel/setup_64.c
+++ /dev/null
@@ -1,1194 +0,0 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 */
4
5/*
6 * This file handles the architecture-dependent parts of initialization
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/stddef.h>
14#include <linux/unistd.h>
15#include <linux/ptrace.h>
16#include <linux/slab.h>
17#include <linux/user.h>
18#include <linux/screen_info.h>
19#include <linux/ioport.h>
20#include <linux/delay.h>
21#include <linux/init.h>
22#include <linux/initrd.h>
23#include <linux/highmem.h>
24#include <linux/bootmem.h>
25#include <linux/module.h>
26#include <asm/processor.h>
27#include <linux/console.h>
28#include <linux/seq_file.h>
29#include <linux/crash_dump.h>
30#include <linux/root_dev.h>
31#include <linux/pci.h>
32#include <asm/pci-direct.h>
33#include <linux/efi.h>
34#include <linux/acpi.h>
35#include <linux/kallsyms.h>
36#include <linux/edd.h>
37#include <linux/iscsi_ibft.h>
38#include <linux/mmzone.h>
39#include <linux/kexec.h>
40#include <linux/cpufreq.h>
41#include <linux/dmi.h>
42#include <linux/dma-mapping.h>
43#include <linux/ctype.h>
44#include <linux/sort.h>
45#include <linux/uaccess.h>
46#include <linux/init_ohci1394_dma.h>
47#include <linux/kvm_para.h>
48
49#include <asm/mtrr.h>
50#include <asm/uaccess.h>
51#include <asm/system.h>
52#include <asm/vsyscall.h>
53#include <asm/io.h>
54#include <asm/smp.h>
55#include <asm/msr.h>
56#include <asm/desc.h>
57#include <video/edid.h>
58#include <asm/e820.h>
59#include <asm/dma.h>
60#include <asm/gart.h>
61#include <asm/mpspec.h>
62#include <asm/mmu_context.h>
63#include <asm/proto.h>
64#include <asm/setup.h>
65#include <asm/numa.h>
66#include <asm/sections.h>
67#include <asm/dmi.h>
68#include <asm/cacheflush.h>
69#include <asm/mce.h>
70#include <asm/ds.h>
71#include <asm/topology.h>
72#include <asm/trampoline.h>
73#include <asm/pat.h>
74
75#include <mach_apic.h>
76#ifdef CONFIG_PARAVIRT
77#include <asm/paravirt.h>
78#else
79#define ARCH_SETUP
80#endif
81
82/*
83 * Machine setup..
84 */
85
86struct cpuinfo_x86 boot_cpu_data __read_mostly;
87EXPORT_SYMBOL(boot_cpu_data);
88
89__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
90
91unsigned long mmu_cr4_features;
92
93/* Boot loader ID as an integer, for the benefit of proc_dointvec */
94int bootloader_type;
95
96unsigned long saved_video_mode;
97
98int force_mwait __cpuinitdata;
99
100/*
101 * Early DMI memory
102 */
103int dmi_alloc_index;
104char dmi_alloc_data[DMI_MAX_DATA];
105
106/*
107 * Setup options
108 */
109struct screen_info screen_info;
110EXPORT_SYMBOL(screen_info);
111struct sys_desc_table_struct {
112 unsigned short length;
113 unsigned char table[0];
114};
115
116struct edid_info edid_info;
117EXPORT_SYMBOL_GPL(edid_info);
118
119extern int root_mountflags;
120
121char __initdata command_line[COMMAND_LINE_SIZE];
122
123static struct resource standard_io_resources[] = {
124 { .name = "dma1", .start = 0x00, .end = 0x1f,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
126 { .name = "pic1", .start = 0x20, .end = 0x21,
127 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
128 { .name = "timer0", .start = 0x40, .end = 0x43,
129 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
130 { .name = "timer1", .start = 0x50, .end = 0x53,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
132 { .name = "keyboard", .start = 0x60, .end = 0x60,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
134 { .name = "keyboard", .start = 0x64, .end = 0x64,
135 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
136 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
137 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
138 { .name = "pic2", .start = 0xa0, .end = 0xa1,
139 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
140 { .name = "dma2", .start = 0xc0, .end = 0xdf,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
142 { .name = "fpu", .start = 0xf0, .end = 0xff,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
144};
145
146#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
147
148static struct resource data_resource = {
149 .name = "Kernel data",
150 .start = 0,
151 .end = 0,
152 .flags = IORESOURCE_RAM,
153};
154static struct resource code_resource = {
155 .name = "Kernel code",
156 .start = 0,
157 .end = 0,
158 .flags = IORESOURCE_RAM,
159};
160static struct resource bss_resource = {
161 .name = "Kernel bss",
162 .start = 0,
163 .end = 0,
164 .flags = IORESOURCE_RAM,
165};
166
167static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
168
169#ifdef CONFIG_PROC_VMCORE
170/* elfcorehdr= specifies the location of elf core header
171 * stored by the crashed kernel. This option will be passed
172 * by kexec loader to the capture kernel.
173 */
174static int __init setup_elfcorehdr(char *arg)
175{
176 char *end;
177 if (!arg)
178 return -EINVAL;
179 elfcorehdr_addr = memparse(arg, &end);
180 return end > arg ? 0 : -EINVAL;
181}
182early_param("elfcorehdr", setup_elfcorehdr);
183#endif
184
185#ifndef CONFIG_NUMA
186static void __init
187contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
188{
189 unsigned long bootmap_size, bootmap;
190
191 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
192 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
193 PAGE_SIZE);
194 if (bootmap == -1L)
195 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
196 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
197 e820_register_active_regions(0, start_pfn, end_pfn);
198 free_bootmem_with_active_regions(0, end_pfn);
199 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
200 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
201}
202#endif
203
204#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
205struct edd edd;
206#ifdef CONFIG_EDD_MODULE
207EXPORT_SYMBOL(edd);
208#endif
209/**
210 * copy_edd() - Copy the BIOS EDD information
211 * from boot_params into a safe place.
212 *
213 */
214static inline void copy_edd(void)
215{
216 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
217 sizeof(edd.mbr_signature));
218 memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
219 edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
220 edd.edd_info_nr = boot_params.eddbuf_entries;
221}
222#else
223static inline void copy_edd(void)
224{
225}
226#endif
227
228#ifdef CONFIG_KEXEC
229static void __init reserve_crashkernel(void)
230{
231 unsigned long long total_mem;
232 unsigned long long crash_size, crash_base;
233 int ret;
234
235 total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
236
237 ret = parse_crashkernel(boot_command_line, total_mem,
238 &crash_size, &crash_base);
239 if (ret == 0 && crash_size) {
240 if (crash_base <= 0) {
241 printk(KERN_INFO "crashkernel reservation failed - "
242 "you have to specify a base address\n");
243 return;
244 }
245
246 if (reserve_bootmem(crash_base, crash_size,
247 BOOTMEM_EXCLUSIVE) < 0) {
248 printk(KERN_INFO "crashkernel reservation failed - "
249 "memory is in use\n");
250 return;
251 }
252
253 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
254 "for crashkernel (System RAM: %ldMB)\n",
255 (unsigned long)(crash_size >> 20),
256 (unsigned long)(crash_base >> 20),
257 (unsigned long)(total_mem >> 20));
258 crashk_res.start = crash_base;
259 crashk_res.end = crash_base + crash_size - 1;
260 insert_resource(&iomem_resource, &crashk_res);
261 }
262}
263#else
264static inline void __init reserve_crashkernel(void)
265{}
266#endif
267
268/* Overridden in paravirt.c if CONFIG_PARAVIRT */
269void __attribute__((weak)) __init memory_setup(void)
270{
271 machine_specific_memory_setup();
272}
273
274static void __init parse_setup_data(void)
275{
276 struct setup_data *data;
277 unsigned long pa_data;
278
279 if (boot_params.hdr.version < 0x0209)
280 return;
281 pa_data = boot_params.hdr.setup_data;
282 while (pa_data) {
283 data = early_ioremap(pa_data, PAGE_SIZE);
284 switch (data->type) {
285 default:
286 break;
287 }
288#ifndef CONFIG_DEBUG_BOOT_PARAMS
289 free_early(pa_data, pa_data+sizeof(*data)+data->len);
290#endif
291 pa_data = data->next;
292 early_iounmap(data, PAGE_SIZE);
293 }
294}
295
296#ifdef CONFIG_PCI_MMCONFIG
297extern void __cpuinit fam10h_check_enable_mmcfg(void);
298extern void __init check_enable_amd_mmconf_dmi(void);
299#else
300void __cpuinit fam10h_check_enable_mmcfg(void)
301{
302}
303void __init check_enable_amd_mmconf_dmi(void)
304{
305}
306#endif
307
308/*
309 * setup_arch - architecture-specific boot-time initializations
310 *
311 * Note: On x86_64, fixmaps are ready for use even before this is called.
312 */
313void __init setup_arch(char **cmdline_p)
314{
315 unsigned i;
316
317 printk(KERN_INFO "Command line: %s\n", boot_command_line);
318
319 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
320 screen_info = boot_params.screen_info;
321 edid_info = boot_params.edid_info;
322 saved_video_mode = boot_params.hdr.vid_mode;
323 bootloader_type = boot_params.hdr.type_of_loader;
324
325#ifdef CONFIG_BLK_DEV_RAM
326 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
327 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
328 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
329#endif
330#ifdef CONFIG_EFI
331 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
332 "EL64", 4))
333 efi_enabled = 1;
334#endif
335
336 ARCH_SETUP
337
338 memory_setup();
339 copy_edd();
340
341 if (!boot_params.hdr.root_flags)
342 root_mountflags &= ~MS_RDONLY;
343 init_mm.start_code = (unsigned long) &_text;
344 init_mm.end_code = (unsigned long) &_etext;
345 init_mm.end_data = (unsigned long) &_edata;
346 init_mm.brk = (unsigned long) &_end;
347
348 code_resource.start = virt_to_phys(&_text);
349 code_resource.end = virt_to_phys(&_etext)-1;
350 data_resource.start = virt_to_phys(&_etext);
351 data_resource.end = virt_to_phys(&_edata)-1;
352 bss_resource.start = virt_to_phys(&__bss_start);
353 bss_resource.end = virt_to_phys(&__bss_stop)-1;
354
355 early_identify_cpu(&boot_cpu_data);
356
357 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
358 *cmdline_p = command_line;
359
360 parse_setup_data();
361
362 parse_early_param();
363
364#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
365 if (init_ohci1394_dma_early)
366 init_ohci1394_dma_on_all_controllers();
367#endif
368
369 finish_e820_parsing();
370
371 /* after parse_early_param, so could debug it */
372 insert_resource(&iomem_resource, &code_resource);
373 insert_resource(&iomem_resource, &data_resource);
374 insert_resource(&iomem_resource, &bss_resource);
375
376 early_gart_iommu_check();
377
378 e820_register_active_regions(0, 0, -1UL);
379 /*
380 * partially used pages are not usable - thus
381 * we are rounding upwards:
382 */
383 end_pfn = e820_end_of_ram();
384 /* update e820 for memory not covered by WB MTRRs */
385 mtrr_bp_init();
386 if (mtrr_trim_uncached_memory(end_pfn)) {
387 e820_register_active_regions(0, 0, -1UL);
388 end_pfn = e820_end_of_ram();
389 }
390
391 num_physpages = end_pfn;
392
393 check_efer();
394
395 max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
396 if (efi_enabled)
397 efi_init();
398
399 vsmp_init();
400
401 dmi_scan_machine();
402
403 io_delay_init();
404
405#ifdef CONFIG_KVM_CLOCK
406 kvmclock_init();
407#endif
408
409#ifdef CONFIG_SMP
410 /* setup to use the early static init tables during kernel startup */
411 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
412 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
413#ifdef CONFIG_NUMA
414 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
415#endif
416#endif
417
418#ifdef CONFIG_ACPI
419 /*
420 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
421 * Call this early for SRAT node setup.
422 */
423 acpi_boot_table_init();
424#endif
425
426 /* How many end-of-memory variables you have, grandma! */
427 max_low_pfn = end_pfn;
428 max_pfn = end_pfn;
429 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
430
431 /* Remove active ranges so rediscovery with NUMA-awareness happens */
432 remove_all_active_ranges();
433
434#ifdef CONFIG_ACPI_NUMA
435 /*
436 * Parse SRAT to discover nodes.
437 */
438 acpi_numa_init();
439#endif
440
441#ifdef CONFIG_NUMA
442 numa_initmem_init(0, end_pfn);
443#else
444 contig_initmem_init(0, end_pfn);
445#endif
446
447 dma32_reserve_bootmem();
448
449#ifdef CONFIG_ACPI_SLEEP
450 /*
451 * Reserve low memory region for sleep support.
452 */
453 acpi_reserve_bootmem();
454#endif
455
456 if (efi_enabled)
457 efi_reserve_bootmem();
458
459 /*
460 * Find and reserve possible boot-time SMP configuration:
461 */
462 find_smp_config();
463#ifdef CONFIG_BLK_DEV_INITRD
464 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
465 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
466 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
467 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
468 unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
469
470 if (ramdisk_end <= end_of_mem) {
471 /*
472 * don't need to reserve again, already reserved early
473 * in x86_64_start_kernel, and early_res_to_bootmem
474 * convert that to reserved in bootmem
475 */
476 initrd_start = ramdisk_image + PAGE_OFFSET;
477 initrd_end = initrd_start+ramdisk_size;
478 } else {
479 free_bootmem(ramdisk_image, ramdisk_size);
480 printk(KERN_ERR "initrd extends beyond end of memory "
481 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
482 ramdisk_end, end_of_mem);
483 initrd_start = 0;
484 }
485 }
486#endif
487 reserve_crashkernel();
488
489 reserve_ibft_region();
490
491 paging_init();
492 map_vsyscall();
493
494 early_quirks();
495
496#ifdef CONFIG_ACPI
497 /*
498 * Read APIC and some other early information from ACPI tables.
499 */
500 acpi_boot_init();
501#endif
502
503 init_cpu_to_node();
504
505 /*
506 * get boot-time SMP configuration:
507 */
508 if (smp_found_config)
509 get_smp_config();
510 init_apic_mappings();
511 ioapic_init_mappings();
512
513 kvm_guest_init();
514
515 /*
516 * We trust e820 completely. No explicit ROM probing in memory.
517 */
518 e820_reserve_resources();
519 e820_mark_nosave_regions();
520
521 /* request I/O space for devices used on all i[345]86 PCs */
522 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
523 request_resource(&ioport_resource, &standard_io_resources[i]);
524
525 e820_setup_gap();
526
527#ifdef CONFIG_VT
528#if defined(CONFIG_VGA_CONSOLE)
529 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
530 conswitchp = &vga_con;
531#elif defined(CONFIG_DUMMY_CONSOLE)
532 conswitchp = &dummy_con;
533#endif
534#endif
535
536 /* do this before identify_cpu for boot cpu */
537 check_enable_amd_mmconf_dmi();
538}
539
540static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
541{
542 unsigned int *v;
543
544 if (c->extended_cpuid_level < 0x80000004)
545 return 0;
546
547 v = (unsigned int *) c->x86_model_id;
548 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
549 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
550 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
551 c->x86_model_id[48] = 0;
552 return 1;
553}
554
555
556static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
557{
558 unsigned int n, dummy, eax, ebx, ecx, edx;
559
560 n = c->extended_cpuid_level;
561
562 if (n >= 0x80000005) {
563 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
564 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
565 "D cache %dK (%d bytes/line)\n",
566 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
567 c->x86_cache_size = (ecx>>24) + (edx>>24);
568 /* On K8 L1 TLB is inclusive, so don't count it */
569 c->x86_tlbsize = 0;
570 }
571
572 if (n >= 0x80000006) {
573 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
574 ecx = cpuid_ecx(0x80000006);
575 c->x86_cache_size = ecx >> 16;
576 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
577
578 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
579 c->x86_cache_size, ecx & 0xFF);
580 }
581 if (n >= 0x80000008) {
582 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
583 c->x86_virt_bits = (eax >> 8) & 0xff;
584 c->x86_phys_bits = eax & 0xff;
585 }
586}
587
588#ifdef CONFIG_NUMA
589static int __cpuinit nearby_node(int apicid)
590{
591 int i, node;
592
593 for (i = apicid - 1; i >= 0; i--) {
594 node = apicid_to_node[i];
595 if (node != NUMA_NO_NODE && node_online(node))
596 return node;
597 }
598 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
599 node = apicid_to_node[i];
600 if (node != NUMA_NO_NODE && node_online(node))
601 return node;
602 }
603 return first_node(node_online_map); /* Shouldn't happen */
604}
605#endif
606
607/*
608 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
609 * Assumes number of cores is a power of two.
610 */
611static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
612{
613#ifdef CONFIG_SMP
614 unsigned bits;
615#ifdef CONFIG_NUMA
616 int cpu = smp_processor_id();
617 int node = 0;
618 unsigned apicid = hard_smp_processor_id();
619#endif
620 bits = c->x86_coreid_bits;
621
622 /* Low order bits define the core id (index of core in socket) */
623 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
624 /* Convert the initial APIC ID into the socket ID */
625 c->phys_proc_id = c->initial_apicid >> bits;
626
627#ifdef CONFIG_NUMA
628 node = c->phys_proc_id;
629 if (apicid_to_node[apicid] != NUMA_NO_NODE)
630 node = apicid_to_node[apicid];
631 if (!node_online(node)) {
632 /* Two possibilities here:
633 - The CPU is missing memory and no node was created.
634 In that case try picking one from a nearby CPU
635 - The APIC IDs differ from the HyperTransport node IDs
636 which the K8 northbridge parsing fills in.
637 Assume they are all increased by a constant offset,
638 but in the same order as the HT nodeids.
639 If that doesn't result in a usable node fall back to the
640 path for the previous case. */
641
642 int ht_nodeid = c->initial_apicid;
643
644 if (ht_nodeid >= 0 &&
645 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
646 node = apicid_to_node[ht_nodeid];
647 /* Pick a nearby node */
648 if (!node_online(node))
649 node = nearby_node(apicid);
650 }
651 numa_set_node(cpu, node);
652
653 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
654#endif
655#endif
656}
657
658static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
659{
660#ifdef CONFIG_SMP
661 unsigned bits, ecx;
662
663 /* Multi core CPU? */
664 if (c->extended_cpuid_level < 0x80000008)
665 return;
666
667 ecx = cpuid_ecx(0x80000008);
668
669 c->x86_max_cores = (ecx & 0xff) + 1;
670
671 /* CPU telling us the core id bits shift? */
672 bits = (ecx >> 12) & 0xF;
673
674 /* Otherwise recompute */
675 if (bits == 0) {
676 while ((1 << bits) < c->x86_max_cores)
677 bits++;
678 }
679
680 c->x86_coreid_bits = bits;
681
682#endif
683}
684
685#define ENABLE_C1E_MASK 0x18000000
686#define CPUID_PROCESSOR_SIGNATURE 1
687#define CPUID_XFAM 0x0ff00000
688#define CPUID_XFAM_K8 0x00000000
689#define CPUID_XFAM_10H 0x00100000
690#define CPUID_XFAM_11H 0x00200000
691#define CPUID_XMOD 0x000f0000
692#define CPUID_XMOD_REV_F 0x00040000
693
694/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
695static __cpuinit int amd_apic_timer_broken(void)
696{
697 u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
698
699 switch (eax & CPUID_XFAM) {
700 case CPUID_XFAM_K8:
701 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
702 break;
703 case CPUID_XFAM_10H:
704 case CPUID_XFAM_11H:
705 rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
706 if (lo & ENABLE_C1E_MASK)
707 return 1;
708 break;
709 default:
710 /* err on the side of caution */
711 return 1;
712 }
713 return 0;
714}
715
716static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
717{
718 early_init_amd_mc(c);
719
720 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
721 if (c->x86_power & (1<<8))
722 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
723}
724
725static void __cpuinit init_amd(struct cpuinfo_x86 *c)
726{
727 unsigned level;
728
729#ifdef CONFIG_SMP
730 unsigned long value;
731
732 /*
733 * Disable TLB flush filter by setting HWCR.FFDIS on K8
734 * bit 6 of msr C001_0015
735 *
736 * Errata 63 for SH-B3 steppings
737 * Errata 122 for all steppings (F+ have it disabled by default)
738 */
739 if (c->x86 == 15) {
740 rdmsrl(MSR_K8_HWCR, value);
741 value |= 1 << 6;
742 wrmsrl(MSR_K8_HWCR, value);
743 }
744#endif
745
746 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
747 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
748 clear_cpu_cap(c, 0*32+31);
749
750 /* On C+ stepping K8 rep microcode works well for copy/memset */
751 level = cpuid_eax(1);
752 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
753 level >= 0x0f58))
754 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
755 if (c->x86 == 0x10 || c->x86 == 0x11)
756 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
757
758 /* Enable workaround for FXSAVE leak */
759 if (c->x86 >= 6)
760 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
761
762 level = get_model_name(c);
763 if (!level) {
764 switch (c->x86) {
765 case 15:
766 /* Should distinguish Models here, but this is only
767 a fallback anyways. */
768 strcpy(c->x86_model_id, "Hammer");
769 break;
770 }
771 }
772 display_cacheinfo(c);
773
774 /* Multi core CPU? */
775 if (c->extended_cpuid_level >= 0x80000008)
776 amd_detect_cmp(c);
777
778 if (c->extended_cpuid_level >= 0x80000006 &&
779 (cpuid_edx(0x80000006) & 0xf000))
780 num_cache_leaves = 4;
781 else
782 num_cache_leaves = 3;
783
784 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
785 set_cpu_cap(c, X86_FEATURE_K8);
786
787 /* MFENCE stops RDTSC speculation */
788 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
789
790 if (c->x86 == 0x10)
791 fam10h_check_enable_mmcfg();
792
793 if (amd_apic_timer_broken())
794 disable_apic_timer = 1;
795
796 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
797 unsigned long long tseg;
798
799 /*
800 * Split up direct mapping around the TSEG SMM area.
801 * Don't do it for gbpages because there seems very little
802 * benefit in doing so.
803 */
804 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
805 (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
806 set_memory_4k((unsigned long)__va(tseg), 1);
807 }
808}
809
810void __cpuinit detect_ht(struct cpuinfo_x86 *c)
811{
812#ifdef CONFIG_SMP
813 u32 eax, ebx, ecx, edx;
814 int index_msb, core_bits;
815
816 cpuid(1, &eax, &ebx, &ecx, &edx);
817
818
819 if (!cpu_has(c, X86_FEATURE_HT))
820 return;
821 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
822 goto out;
823
824 smp_num_siblings = (ebx & 0xff0000) >> 16;
825
826 if (smp_num_siblings == 1) {
827 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
828 } else if (smp_num_siblings > 1) {
829
830 if (smp_num_siblings > NR_CPUS) {
831 printk(KERN_WARNING "CPU: Unsupported number of "
832 "siblings %d", smp_num_siblings);
833 smp_num_siblings = 1;
834 return;
835 }
836
837 index_msb = get_count_order(smp_num_siblings);
838 c->phys_proc_id = phys_pkg_id(index_msb);
839
840 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
841
842 index_msb = get_count_order(smp_num_siblings);
843
844 core_bits = get_count_order(c->x86_max_cores);
845
846 c->cpu_core_id = phys_pkg_id(index_msb) &
847 ((1 << core_bits) - 1);
848 }
849out:
850 if ((c->x86_max_cores * smp_num_siblings) > 1) {
851 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
852 c->phys_proc_id);
853 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
854 c->cpu_core_id);
855 }
856
857#endif
858}
859
860/*
861 * find out the number of processor cores on the die
862 */
863static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
864{
865 unsigned int eax, t;
866
867 if (c->cpuid_level < 4)
868 return 1;
869
870 cpuid_count(4, 0, &eax, &t, &t, &t);
871
872 if (eax & 0x1f)
873 return ((eax >> 26) + 1);
874 else
875 return 1;
876}
877
878static void __cpuinit srat_detect_node(void)
879{
880#ifdef CONFIG_NUMA
881 unsigned node;
882 int cpu = smp_processor_id();
883 int apicid = hard_smp_processor_id();
884
885 /* Don't do the funky fallback heuristics the AMD version employs
886 for now. */
887 node = apicid_to_node[apicid];
888 if (node == NUMA_NO_NODE || !node_online(node))
889 node = first_node(node_online_map);
890 numa_set_node(cpu, node);
891
892 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
893#endif
894}
895
896static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
897{
898 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
899 (c->x86 == 0x6 && c->x86_model >= 0x0e))
900 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
901}
902
903static void __cpuinit init_intel(struct cpuinfo_x86 *c)
904{
905 /* Cache sizes */
906 unsigned n;
907
908 init_intel_cacheinfo(c);
909 if (c->cpuid_level > 9) {
910 unsigned eax = cpuid_eax(10);
911 /* Check for version and the number of counters */
912 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
913 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
914 }
915
916 if (cpu_has_ds) {
917 unsigned int l1, l2;
918 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
919 if (!(l1 & (1<<11)))
920 set_cpu_cap(c, X86_FEATURE_BTS);
921 if (!(l1 & (1<<12)))
922 set_cpu_cap(c, X86_FEATURE_PEBS);
923 }
924
925
926 if (cpu_has_bts)
927 ds_init_intel(c);
928
929 n = c->extended_cpuid_level;
930 if (n >= 0x80000008) {
931 unsigned eax = cpuid_eax(0x80000008);
932 c->x86_virt_bits = (eax >> 8) & 0xff;
933 c->x86_phys_bits = eax & 0xff;
934 /* CPUID workaround for Intel 0F34 CPU */
935 if (c->x86_vendor == X86_VENDOR_INTEL &&
936 c->x86 == 0xF && c->x86_model == 0x3 &&
937 c->x86_mask == 0x4)
938 c->x86_phys_bits = 36;
939 }
940
941 if (c->x86 == 15)
942 c->x86_cache_alignment = c->x86_clflush_size * 2;
943 if (c->x86 == 6)
944 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
945 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
946 c->x86_max_cores = intel_num_cpu_cores(c);
947
948 srat_detect_node();
949}
950
951static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
952{
953 if (c->x86 == 0x6 && c->x86_model >= 0xf)
954 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
955}
956
957static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
958{
959 /* Cache sizes */
960 unsigned n;
961
962 n = c->extended_cpuid_level;
963 if (n >= 0x80000008) {
964 unsigned eax = cpuid_eax(0x80000008);
965 c->x86_virt_bits = (eax >> 8) & 0xff;
966 c->x86_phys_bits = eax & 0xff;
967 }
968
969 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
970 c->x86_cache_alignment = c->x86_clflush_size * 2;
971 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
972 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
973 }
974 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
975}
976
977static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
978{
979 char *v = c->x86_vendor_id;
980
981 if (!strcmp(v, "AuthenticAMD"))
982 c->x86_vendor = X86_VENDOR_AMD;
983 else if (!strcmp(v, "GenuineIntel"))
984 c->x86_vendor = X86_VENDOR_INTEL;
985 else if (!strcmp(v, "CentaurHauls"))
986 c->x86_vendor = X86_VENDOR_CENTAUR;
987 else
988 c->x86_vendor = X86_VENDOR_UNKNOWN;
989}
990
991/* Do some early cpuid on the boot CPU to get some parameter that are
992 needed before check_bugs. Everything advanced is in identify_cpu
993 below. */
994static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
995{
996 u32 tfms, xlvl;
997
998 c->loops_per_jiffy = loops_per_jiffy;
999 c->x86_cache_size = -1;
1000 c->x86_vendor = X86_VENDOR_UNKNOWN;
1001 c->x86_model = c->x86_mask = 0; /* So far unknown... */
1002 c->x86_vendor_id[0] = '\0'; /* Unset */
1003 c->x86_model_id[0] = '\0'; /* Unset */
1004 c->x86_clflush_size = 64;
1005 c->x86_cache_alignment = c->x86_clflush_size;
1006 c->x86_max_cores = 1;
1007 c->x86_coreid_bits = 0;
1008 c->extended_cpuid_level = 0;
1009 memset(&c->x86_capability, 0, sizeof c->x86_capability);
1010
1011 /* Get vendor name */
1012 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
1013 (unsigned int *)&c->x86_vendor_id[0],
1014 (unsigned int *)&c->x86_vendor_id[8],
1015 (unsigned int *)&c->x86_vendor_id[4]);
1016
1017 get_cpu_vendor(c);
1018
1019 /* Initialize the standard set of capabilities */
1020 /* Note that the vendor-specific code below might override */
1021
1022 /* Intel-defined flags: level 0x00000001 */
1023 if (c->cpuid_level >= 0x00000001) {
1024 __u32 misc;
1025 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
1026 &c->x86_capability[0]);
1027 c->x86 = (tfms >> 8) & 0xf;
1028 c->x86_model = (tfms >> 4) & 0xf;
1029 c->x86_mask = tfms & 0xf;
1030 if (c->x86 == 0xf)
1031 c->x86 += (tfms >> 20) & 0xff;
1032 if (c->x86 >= 0x6)
1033 c->x86_model += ((tfms >> 16) & 0xF) << 4;
1034 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
1035 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
1036 } else {
1037 /* Have CPUID level 0 only - unheard of */
1038 c->x86 = 4;
1039 }
1040
1041 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
1042#ifdef CONFIG_SMP
1043 c->phys_proc_id = c->initial_apicid;
1044#endif
1045 /* AMD-defined flags: level 0x80000001 */
1046 xlvl = cpuid_eax(0x80000000);
1047 c->extended_cpuid_level = xlvl;
1048 if ((xlvl & 0xffff0000) == 0x80000000) {
1049 if (xlvl >= 0x80000001) {
1050 c->x86_capability[1] = cpuid_edx(0x80000001);
1051 c->x86_capability[6] = cpuid_ecx(0x80000001);
1052 }
1053 if (xlvl >= 0x80000004)
1054 get_model_name(c); /* Default name */
1055 }
1056
1057 /* Transmeta-defined flags: level 0x80860001 */
1058 xlvl = cpuid_eax(0x80860000);
1059 if ((xlvl & 0xffff0000) == 0x80860000) {
1060 /* Don't set x86_cpuid_level here for now to not confuse. */
1061 if (xlvl >= 0x80860001)
1062 c->x86_capability[2] = cpuid_edx(0x80860001);
1063 }
1064
1065 c->extended_cpuid_level = cpuid_eax(0x80000000);
1066 if (c->extended_cpuid_level >= 0x80000007)
1067 c->x86_power = cpuid_edx(0x80000007);
1068
1069 switch (c->x86_vendor) {
1070 case X86_VENDOR_AMD:
1071 early_init_amd(c);
1072 break;
1073 case X86_VENDOR_INTEL:
1074 early_init_intel(c);
1075 break;
1076 case X86_VENDOR_CENTAUR:
1077 early_init_centaur(c);
1078 break;
1079 }
1080
1081 validate_pat_support(c);
1082}
1083
1084/*
1085 * This does the hard work of actually picking apart the CPU stuff...
1086 */
1087void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1088{
1089 int i;
1090
1091 early_identify_cpu(c);
1092
1093 init_scattered_cpuid_features(c);
1094
1095 c->apicid = phys_pkg_id(0);
1096
1097 /*
1098 * Vendor-specific initialization. In this section we
1099 * canonicalize the feature flags, meaning if there are
1100 * features a certain CPU supports which CPUID doesn't
1101 * tell us, CPUID claiming incorrect flags, or other bugs,
1102 * we handle them here.
1103 *
1104 * At the end of this section, c->x86_capability better
1105 * indicate the features this CPU genuinely supports!
1106 */
1107 switch (c->x86_vendor) {
1108 case X86_VENDOR_AMD:
1109 init_amd(c);
1110 break;
1111
1112 case X86_VENDOR_INTEL:
1113 init_intel(c);
1114 break;
1115
1116 case X86_VENDOR_CENTAUR:
1117 init_centaur(c);
1118 break;
1119
1120 case X86_VENDOR_UNKNOWN:
1121 default:
1122 display_cacheinfo(c);
1123 break;
1124 }
1125
1126 detect_ht(c);
1127
1128 /*
1129 * On SMP, boot_cpu_data holds the common feature set between
1130 * all CPUs; so make sure that we indicate which features are
1131 * common between the CPUs. The first time this routine gets
1132 * executed, c == &boot_cpu_data.
1133 */
1134 if (c != &boot_cpu_data) {
1135 /* AND the already accumulated flags with these */
1136 for (i = 0; i < NCAPINTS; i++)
1137 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1138 }
1139
1140 /* Clear all flags overriden by options */
1141 for (i = 0; i < NCAPINTS; i++)
1142 c->x86_capability[i] &= ~cleared_cpu_caps[i];
1143
1144#ifdef CONFIG_X86_MCE
1145 mcheck_init(c);
1146#endif
1147 select_idle_routine(c);
1148
1149#ifdef CONFIG_NUMA
1150 numa_add_cpu(smp_processor_id());
1151#endif
1152
1153}
1154
1155void __cpuinit identify_boot_cpu(void)
1156{
1157 identify_cpu(&boot_cpu_data);
1158}
1159
1160void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1161{
1162 BUG_ON(c == &boot_cpu_data);
1163 identify_cpu(c);
1164 mtrr_ap_init();
1165}
1166
1167static __init int setup_noclflush(char *arg)
1168{
1169 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1170 return 1;
1171}
1172__setup("noclflush", setup_noclflush);
1173
1174void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1175{
1176 if (c->x86_model_id[0])
1177 printk(KERN_CONT "%s", c->x86_model_id);
1178
1179 if (c->x86_mask || c->cpuid_level >= 0)
1180 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1181 else
1182 printk(KERN_CONT "\n");
1183}
1184
1185static __init int setup_disablecpuid(char *arg)
1186{
1187 int bit;
1188 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1189 setup_clear_cpu_cap(bit);
1190 else
1191 return 0;
1192 return 1;
1193}
1194__setup("clearcpuid=", setup_disablecpuid);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 000000000000..f7745f94c006
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,399 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/percpu.h>
6#include <linux/kexec.h>
7#include <linux/crash_dump.h>
8#include <asm/smp.h>
9#include <asm/percpu.h>
10#include <asm/sections.h>
11#include <asm/processor.h>
12#include <asm/setup.h>
13#include <asm/topology.h>
14#include <asm/mpspec.h>
15#include <asm/apicdef.h>
16#include <asm/highmem.h>
17
18#ifdef CONFIG_X86_LOCAL_APIC
19unsigned int num_processors;
20unsigned disabled_cpus __cpuinitdata;
21/* Processor that is doing the boot up */
22unsigned int boot_cpu_physical_apicid = -1U;
23unsigned int max_physical_apicid;
24EXPORT_SYMBOL(boot_cpu_physical_apicid);
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28#endif
29
30/* map cpu index to physical APIC ID */
31DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
32DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
34EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
35
36#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
37#define X86_64_NUMA 1
38
39/* map cpu index to node index */
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42
43/* which logical CPUs are on which nodes */
44cpumask_t *node_to_cpumask_map;
45EXPORT_SYMBOL(node_to_cpumask_map);
46
47/* setup node_to_cpumask_map */
48static void __init setup_node_to_cpumask_map(void);
49
50#else
51static inline void setup_node_to_cpumask_map(void) { }
52#endif
53
54#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
55/*
56 * Copy data used in early init routines from the initial arrays to the
57 * per cpu data areas. These arrays then become expendable and the
58 * *_early_ptr's are zeroed indicating that the static arrays are gone.
59 */
60static void __init setup_per_cpu_maps(void)
61{
62 int cpu;
63
64 for_each_possible_cpu(cpu) {
65 per_cpu(x86_cpu_to_apicid, cpu) =
66 early_per_cpu_map(x86_cpu_to_apicid, cpu);
67 per_cpu(x86_bios_cpu_apicid, cpu) =
68 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
69#ifdef X86_64_NUMA
70 per_cpu(x86_cpu_to_node_map, cpu) =
71 early_per_cpu_map(x86_cpu_to_node_map, cpu);
72#endif
73 }
74
75 /* indicate the early static arrays will soon be gone */
76 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
77 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
78#ifdef X86_64_NUMA
79 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
80#endif
81}
82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32
102/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it
104 * the same way
105 */
106unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
107EXPORT_SYMBOL(__per_cpu_offset);
108static inline void setup_cpu_pda_map(void) { }
109
110#elif !defined(CONFIG_SMP)
111static inline void setup_cpu_pda_map(void) { }
112
113#else /* CONFIG_SMP && CONFIG_X86_64 */
114
115/*
116 * Allocate cpu_pda pointer table and array via alloc_bootmem.
117 */
118static void __init setup_cpu_pda_map(void)
119{
120 char *pda;
121 struct x8664_pda **new_cpu_pda;
122 unsigned long size;
123 int cpu;
124
125 size = roundup(sizeof(struct x8664_pda), cache_line_size());
126
127 /* allocate cpu_pda array and pointer table */
128 {
129 unsigned long tsize = nr_cpu_ids * sizeof(void *);
130 unsigned long asize = size * (nr_cpu_ids - 1);
131
132 tsize = roundup(tsize, cache_line_size());
133 new_cpu_pda = alloc_bootmem(tsize + asize);
134 pda = (char *)new_cpu_pda + tsize;
135 }
136
137 /* initialize pointer table to static pda's */
138 for_each_possible_cpu(cpu) {
139 if (cpu == 0) {
140 /* leave boot cpu pda in place */
141 new_cpu_pda[0] = cpu_pda(0);
142 continue;
143 }
144 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
145 new_cpu_pda[cpu]->in_bootmem = 1;
146 pda += size;
147 }
148
149 /* point to new pointer table */
150 _cpu_pda = new_cpu_pda;
151}
152#endif
153
154/*
155 * Great future plan:
156 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
157 * Always point %gs to its beginning
158 */
159void __init setup_per_cpu_areas(void)
160{
161 ssize_t size = PERCPU_ENOUGH_ROOM;
162 char *ptr;
163 int cpu;
164
165 /* Setup cpu_pda map */
166 setup_cpu_pda_map();
167
168 /* Copy section for each CPU (we discard the original) */
169 size = PERCPU_ENOUGH_ROOM;
170 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
171 size);
172
173 for_each_possible_cpu(cpu) {
174#ifndef CONFIG_NEED_MULTIPLE_NODES
175 ptr = alloc_bootmem_pages(size);
176#else
177 int node = early_cpu_to_node(cpu);
178 if (!node_online(node) || !NODE_DATA(node)) {
179 ptr = alloc_bootmem_pages(size);
180 printk(KERN_INFO
181 "cpu %d has no node %d or node-local memory\n",
182 cpu, node);
183 }
184 else
185 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
186#endif
187 per_cpu_offset(cpu) = ptr - __per_cpu_start;
188 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
189
190 }
191
192 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
193 NR_CPUS, nr_cpu_ids, nr_node_ids);
194
195 /* Setup percpu data maps */
196 setup_per_cpu_maps();
197
198 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203}
204
205#endif
206
207#ifdef X86_64_NUMA
208
209/*
210 * Allocate node_to_cpumask_map based on number of available nodes
211 * Requires node_possible_map to be valid.
212 *
213 * Note: node_to_cpumask() is not valid until after this is done.
214 */
215static void __init setup_node_to_cpumask_map(void)
216{
217 unsigned int node, num = 0;
218 cpumask_t *map;
219
220 /* setup nr_node_ids if not done yet */
221 if (nr_node_ids == MAX_NUMNODES) {
222 for_each_node_mask(node, node_possible_map)
223 num = node;
224 nr_node_ids = num + 1;
225 }
226
227 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229
230 pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids);
232
233 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map;
235}
236
237void __cpuinit numa_set_node(int cpu, int node)
238{
239 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
240
241 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
242 cpu_pda(cpu)->nodenumber = node;
243
244 if (cpu_to_node_map)
245 cpu_to_node_map[cpu] = node;
246
247 else if (per_cpu_offset(cpu))
248 per_cpu(x86_cpu_to_node_map, cpu) = node;
249
250 else
251 pr_debug("Setting node for non-present cpu %d\n", cpu);
252}
253
254void __cpuinit numa_clear_node(int cpu)
255{
256 numa_set_node(cpu, NUMA_NO_NODE);
257}
258
259#ifndef CONFIG_DEBUG_PER_CPU_MAPS
260
261void __cpuinit numa_add_cpu(int cpu)
262{
263 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
264}
265
266void __cpuinit numa_remove_cpu(int cpu)
267{
268 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
269}
270
271#else /* CONFIG_DEBUG_PER_CPU_MAPS */
272
273/*
274 * --------- debug versions of the numa functions ---------
275 */
276static void __cpuinit numa_set_cpumask(int cpu, int enable)
277{
278 int node = cpu_to_node(cpu);
279 cpumask_t *mask;
280 char buf[64];
281
282 if (node_to_cpumask_map == NULL) {
283 printk(KERN_ERR "node_to_cpumask_map NULL\n");
284 dump_stack();
285 return;
286 }
287
288 mask = &node_to_cpumask_map[node];
289 if (enable)
290 cpu_set(cpu, *mask);
291 else
292 cpu_clear(cpu, *mask);
293
294 cpulist_scnprintf(buf, sizeof(buf), *mask);
295 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
296 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
297 }
298
299void __cpuinit numa_add_cpu(int cpu)
300{
301 numa_set_cpumask(cpu, 1);
302}
303
304void __cpuinit numa_remove_cpu(int cpu)
305{
306 numa_set_cpumask(cpu, 0);
307}
308
309int cpu_to_node(int cpu)
310{
311 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
312 printk(KERN_WARNING
313 "cpu_to_node(%d): usage too early!\n", cpu);
314 dump_stack();
315 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
316 }
317 return per_cpu(x86_cpu_to_node_map, cpu);
318}
319EXPORT_SYMBOL(cpu_to_node);
320
321/*
322 * Same function as cpu_to_node() but used if called before the
323 * per_cpu areas are setup.
324 */
325int early_cpu_to_node(int cpu)
326{
327 if (early_per_cpu_ptr(x86_cpu_to_node_map))
328 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
329
330 if (!per_cpu_offset(cpu)) {
331 printk(KERN_WARNING
332 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
333 dump_stack();
334 return NUMA_NO_NODE;
335 }
336 return per_cpu(x86_cpu_to_node_map, cpu);
337}
338
339
340/* empty cpumask */
341static const cpumask_t cpu_mask_none;
342
343/*
344 * Returns a pointer to the bitmask of CPUs on Node 'node'.
345 */
346const cpumask_t *_node_to_cpumask_ptr(int node)
347{
348 if (node_to_cpumask_map == NULL) {
349 printk(KERN_WARNING
350 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
351 node);
352 dump_stack();
353 return (const cpumask_t *)&cpu_online_map;
354 }
355 if (node >= nr_node_ids) {
356 printk(KERN_WARNING
357 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
358 node, nr_node_ids);
359 dump_stack();
360 return &cpu_mask_none;
361 }
362 return &node_to_cpumask_map[node];
363}
364EXPORT_SYMBOL(_node_to_cpumask_ptr);
365
366/*
367 * Returns a bitmask of CPUs on Node 'node'.
368 *
369 * Side note: this function creates the returned cpumask on the stack
370 * so with a high NR_CPUS count, excessive stack space is used. The
371 * node_to_cpumask_ptr function should be used whenever possible.
372 */
373cpumask_t node_to_cpumask(int node)
374{
375 if (node_to_cpumask_map == NULL) {
376 printk(KERN_WARNING
377 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
378 dump_stack();
379 return cpu_online_map;
380 }
381 if (node >= nr_node_ids) {
382 printk(KERN_WARNING
383 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
384 node, nr_node_ids);
385 dump_stack();
386 return cpu_mask_none;
387 }
388 return node_to_cpumask_map[node];
389}
390EXPORT_SYMBOL(node_to_cpumask);
391
392/*
393 * --------- end of debug versions of the numa functions ---------
394 */
395
396#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
397
398#endif /* X86_64_NUMA */
399
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..6fb5bcdd8933 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
669 663
670 if (thread_info_flags & _TIF_HRTICK_RESCHED)
671 hrtick_resched();
672
673 clear_thread_flag(TIF_IRET); 664 clear_thread_flag(TIF_IRET);
674} 665}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..b45ef8ddd651 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 53 return do_sigaltstack(uss, uoss, regs->sp);
54} 54}
55 55
56/*
57 * Signal frame handlers.
58 */
59
60static inline int save_i387(struct _fpstate __user *buf)
61{
62 struct task_struct *tsk = current;
63 int err = 0;
64
65 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
66 sizeof(tsk->thread.xstate->fxsave));
67
68 if ((unsigned long)buf % 16)
69 printk("save_i387: bad fpstate %p\n", buf);
70
71 if (!used_math())
72 return 0;
73 clear_used_math(); /* trigger finit */
74 if (task_thread_info(tsk)->status & TS_USEDFPU) {
75 err = save_i387_checking((struct i387_fxsave_struct __user *)
76 buf);
77 if (err)
78 return err;
79 task_thread_info(tsk)->status &= ~TS_USEDFPU;
80 stts();
81 } else {
82 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
83 sizeof(struct i387_fxsave_struct)))
84 return -1;
85 }
86 return 1;
87}
88
89/*
90 * This restores directly out of user space. Exceptions are handled.
91 */
92static inline int restore_i387(struct _fpstate __user *buf)
93{
94 struct task_struct *tsk = current;
95 int err;
96
97 if (!used_math()) {
98 err = init_fpu(tsk);
99 if (err)
100 return err;
101 }
102
103 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
104 clts();
105 task_thread_info(current)->status |= TS_USEDFPU;
106 }
107 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
108}
56 109
57/* 110/*
58 * Do a signal return; undo the signal stack. 111 * Do a signal return; undo the signal stack.
@@ -487,12 +540,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 540void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 541 __u32 thread_info_flags)
489{ 542{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 543#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 544 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 545 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -502,9 +549,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
502 /* deal with pending signal delivery */ 549 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 550 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 551 do_signal(regs);
505
506 if (thread_info_flags & _TIF_HRTICK_RESCHED)
507 hrtick_resched();
508} 552}
509 553
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 554void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 0cb7aadc87cd..361b7a4c640c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu)
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
122} 122}
123 123
124/* 124void native_send_call_func_single_ipi(int cpu)
125 * Structure and data for smp_call_function(). This is designed to minimise
126 * static memory requirements. It also looks cleaner.
127 */
128static DEFINE_SPINLOCK(call_lock);
129
130struct call_data_struct {
131 void (*func) (void *info);
132 void *info;
133 atomic_t started;
134 atomic_t finished;
135 int wait;
136};
137
138void lock_ipi_call_lock(void)
139{ 125{
140 spin_lock_irq(&call_lock); 126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
141}
142
143void unlock_ipi_call_lock(void)
144{
145 spin_unlock_irq(&call_lock);
146}
147
148static struct call_data_struct *call_data;
149
150static void __smp_call_function(void (*func) (void *info), void *info,
151 int nonatomic, int wait)
152{
153 struct call_data_struct data;
154 int cpus = num_online_cpus() - 1;
155
156 if (!cpus)
157 return;
158
159 data.func = func;
160 data.info = info;
161 atomic_set(&data.started, 0);
162 data.wait = wait;
163 if (wait)
164 atomic_set(&data.finished, 0);
165
166 call_data = &data;
167 mb();
168
169 /* Send a message to all other CPUs and wait for them to respond */
170 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
171
172 /* Wait for response */
173 while (atomic_read(&data.started) != cpus)
174 cpu_relax();
175
176 if (wait)
177 while (atomic_read(&data.finished) != cpus)
178 cpu_relax();
179} 127}
180 128
181 129void native_send_call_func_ipi(cpumask_t mask)
182/**
183 * smp_call_function_mask(): Run a function on a set of other CPUs.
184 * @mask: The set of cpus to run on. Must not include the current cpu.
185 * @func: The function to run. This must be fast and non-blocking.
186 * @info: An arbitrary pointer to pass to the function.
187 * @wait: If true, wait (atomically) until function has completed on other CPUs.
188 *
189 * Returns 0 on success, else a negative status code.
190 *
191 * If @wait is true, then returns once @func has returned; otherwise
192 * it returns just before the target cpu calls @func.
193 *
194 * You must not call this function with disabled interrupts or from a
195 * hardware interrupt handler or from a bottom half handler.
196 */
197static int
198native_smp_call_function_mask(cpumask_t mask,
199 void (*func)(void *), void *info,
200 int wait)
201{ 130{
202 struct call_data_struct data;
203 cpumask_t allbutself; 131 cpumask_t allbutself;
204 int cpus;
205
206 /* Can deadlock when called with interrupts disabled */
207 WARN_ON(irqs_disabled());
208
209 /* Holding any lock stops cpus from going down. */
210 spin_lock(&call_lock);
211 132
212 allbutself = cpu_online_map; 133 allbutself = cpu_online_map;
213 cpu_clear(smp_processor_id(), allbutself); 134 cpu_clear(smp_processor_id(), allbutself);
214 135
215 cpus_and(mask, mask, allbutself);
216 cpus = cpus_weight(mask);
217
218 if (!cpus) {
219 spin_unlock(&call_lock);
220 return 0;
221 }
222
223 data.func = func;
224 data.info = info;
225 atomic_set(&data.started, 0);
226 data.wait = wait;
227 if (wait)
228 atomic_set(&data.finished, 0);
229
230 call_data = &data;
231 wmb();
232
233 /* Send a message to other CPUs */
234 if (cpus_equal(mask, allbutself) && 136 if (cpus_equal(mask, allbutself) &&
235 cpus_equal(cpu_online_map, cpu_callout_map)) 137 cpus_equal(cpu_online_map, cpu_callout_map))
236 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 138 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
237 else 139 else
238 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
239
240 /* Wait for response */
241 while (atomic_read(&data.started) != cpus)
242 cpu_relax();
243
244 if (wait)
245 while (atomic_read(&data.finished) != cpus)
246 cpu_relax();
247 spin_unlock(&call_lock);
248
249 return 0;
250} 141}
251 142
252static void stop_this_cpu(void *dummy) 143static void stop_this_cpu(void *dummy)
@@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy)
268 159
269static void native_smp_send_stop(void) 160static void native_smp_send_stop(void)
270{ 161{
271 int nolock;
272 unsigned long flags; 162 unsigned long flags;
273 163
274 if (reboot_force) 164 if (reboot_force)
275 return; 165 return;
276 166
277 /* Don't deadlock on the call lock in panic */ 167 smp_call_function(stop_this_cpu, NULL, 0);
278 nolock = !spin_trylock(&call_lock);
279 local_irq_save(flags); 168 local_irq_save(flags);
280 __smp_call_function(stop_this_cpu, NULL, 0, 0);
281 if (!nolock)
282 spin_unlock(&call_lock);
283 disable_local_APIC(); 169 disable_local_APIC();
284 local_irq_restore(flags); 170 local_irq_restore(flags);
285} 171}
@@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
301 187
302void smp_call_function_interrupt(struct pt_regs *regs) 188void smp_call_function_interrupt(struct pt_regs *regs)
303{ 189{
304 void (*func) (void *info) = call_data->func;
305 void *info = call_data->info;
306 int wait = call_data->wait;
307
308 ack_APIC_irq(); 190 ack_APIC_irq();
309 /*
310 * Notify initiating CPU that I've grabbed the data and am
311 * about to execute the function
312 */
313 mb();
314 atomic_inc(&call_data->started);
315 /*
316 * At this point the info structure may be out of scope unless wait==1
317 */
318 irq_enter(); 191 irq_enter();
319 (*func)(info); 192 generic_smp_call_function_interrupt();
320#ifdef CONFIG_X86_32 193#ifdef CONFIG_X86_32
321 __get_cpu_var(irq_stat).irq_call_count++; 194 __get_cpu_var(irq_stat).irq_call_count++;
322#else 195#else
323 add_pda(irq_call_count, 1); 196 add_pda(irq_call_count, 1);
324#endif 197#endif
325 irq_exit(); 198 irq_exit();
199}
326 200
327 if (wait) { 201void smp_call_function_single_interrupt(struct pt_regs *regs)
328 mb(); 202{
329 atomic_inc(&call_data->finished); 203 ack_APIC_irq();
330 } 204 irq_enter();
205 generic_smp_call_function_single_interrupt();
206#ifdef CONFIG_X86_32
207 __get_cpu_var(irq_stat).irq_call_count++;
208#else
209 add_pda(irq_call_count, 1);
210#endif
211 irq_exit();
331} 212}
332 213
333struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
@@ -338,7 +219,8 @@ struct smp_ops smp_ops = {
338 219
339 .smp_send_stop = native_smp_send_stop, 220 .smp_send_stop = native_smp_send_stop,
340 .smp_send_reschedule = native_smp_send_reschedule, 221 .smp_send_reschedule = native_smp_send_reschedule,
341 .smp_call_function_mask = native_smp_call_function_mask, 222
223 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi,
342}; 225};
343EXPORT_SYMBOL_GPL(smp_ops); 226EXPORT_SYMBOL_GPL(smp_ops);
344
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3e1cecedde42..332512767f4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -59,7 +59,6 @@
59#include <asm/pgtable.h> 59#include <asm/pgtable.h>
60#include <asm/tlbflush.h> 60#include <asm/tlbflush.h>
61#include <asm/mtrr.h> 61#include <asm/mtrr.h>
62#include <asm/nmi.h>
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/genapic.h>
65#include <linux/mc146818rtc.h> 64#include <linux/mc146818rtc.h>
@@ -68,22 +67,6 @@
68#include <mach_wakecpu.h> 67#include <mach_wakecpu.h>
69#include <smpboot_hooks.h> 68#include <smpboot_hooks.h>
70 69
71/*
72 * FIXME: For x86_64, those are defined in other files. But moving them here,
73 * would make the setup areas dependent on smp, which is a loss. When we
74 * integrate apic between arches, we can probably do a better job, but
75 * right now, they'll stay here -- glommer
76 */
77
78/* which logical CPU number maps to which CPU (physical APIC ID) */
79u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
80 { [0 ... NR_CPUS-1] = BAD_APICID };
81void *x86_cpu_to_apicid_early_ptr;
82
83u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
84 = { [0 ... NR_CPUS-1] = BAD_APICID };
85void *x86_bios_cpu_apicid_early_ptr;
86
87#ifdef CONFIG_X86_32 70#ifdef CONFIG_X86_32
88u8 apicid_2_node[MAX_APICID]; 71u8 apicid_2_node[MAX_APICID];
89static int low_mappings; 72static int low_mappings;
@@ -198,13 +181,12 @@ static void map_cpu_to_logical_apicid(void)
198 map_cpu_to_node(cpu, node); 181 map_cpu_to_node(cpu, node);
199} 182}
200 183
201static void unmap_cpu_to_logical_apicid(int cpu) 184void numa_remove_cpu(int cpu)
202{ 185{
203 cpu_2_logical_apicid[cpu] = BAD_APICID; 186 cpu_2_logical_apicid[cpu] = BAD_APICID;
204 unmap_cpu_to_node(cpu); 187 unmap_cpu_to_node(cpu);
205} 188}
206#else 189#else
207#define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
208#define map_cpu_to_logical_apicid() do {} while (0) 190#define map_cpu_to_logical_apicid() do {} while (0)
209#endif 191#endif
210 192
@@ -234,7 +216,7 @@ static void __cpuinit smp_callin(void)
234 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
235 phys_id, cpuid); 217 phys_id, cpuid);
236 } 218 }
237 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 219 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
238 220
239 /* 221 /*
240 * STARTUP IPIs are fragile beasts as they might sometimes 222 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -269,7 +251,7 @@ static void __cpuinit smp_callin(void)
269 * boards) 251 * boards)
270 */ 252 */
271 253
272 Dprintk("CALLIN, before setup_local_APIC().\n"); 254 pr_debug("CALLIN, before setup_local_APIC().\n");
273 smp_callin_clear_local_apic(); 255 smp_callin_clear_local_apic();
274 setup_local_APIC(); 256 setup_local_APIC();
275 end_local_APIC_setup(); 257 end_local_APIC_setup();
@@ -284,7 +266,7 @@ static void __cpuinit smp_callin(void)
284 local_irq_enable(); 266 local_irq_enable();
285 calibrate_delay(); 267 calibrate_delay();
286 local_irq_disable(); 268 local_irq_disable();
287 Dprintk("Stack at about %p\n", &cpuid); 269 pr_debug("Stack at about %p\n", &cpuid);
288 270
289 /* 271 /*
290 * Save our processor parameters 272 * Save our processor parameters
@@ -345,19 +327,12 @@ static void __cpuinit start_secondary(void *unused)
345 * lock helps us to not include this cpu in a currently in progress 327 * lock helps us to not include this cpu in a currently in progress
346 * smp_call_function(). 328 * smp_call_function().
347 */ 329 */
348 lock_ipi_call_lock(); 330 ipi_call_lock_irq();
349#ifdef CONFIG_X86_64 331#ifdef CONFIG_X86_IO_APIC
350 spin_lock(&vector_lock); 332 setup_vector_irq(smp_processor_id());
351
352 /* Setup the per cpu irq handling data structures */
353 __setup_vector_irq(smp_processor_id());
354 /*
355 * Allow the master to continue.
356 */
357 spin_unlock(&vector_lock);
358#endif 333#endif
359 cpu_set(smp_processor_id(), cpu_online_map); 334 cpu_set(smp_processor_id(), cpu_online_map);
360 unlock_ipi_call_lock(); 335 ipi_call_unlock_irq();
361 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
362 337
363 setup_secondary_clock(); 338 setup_secondary_clock();
@@ -366,31 +341,8 @@ static void __cpuinit start_secondary(void *unused)
366 cpu_idle(); 341 cpu_idle();
367} 342}
368 343
369#ifdef CONFIG_X86_32
370/*
371 * Everything has been set up for the secondary
372 * CPUs - they just need to reload everything
373 * from the task structure
374 * This function must not return.
375 */
376void __devinit initialize_secondary(void)
377{
378 /*
379 * We don't actually need to load the full TSS,
380 * basically just the stack pointer and the ip.
381 */
382
383 asm volatile(
384 "movl %0,%%esp\n\t"
385 "jmp *%1"
386 :
387 :"m" (current->thread.sp), "m" (current->thread.ip));
388}
389#endif
390
391static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) 344static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
392{ 345{
393#ifdef CONFIG_X86_32
394 /* 346 /*
395 * Mask B, Pentium, but not Pentium MMX 347 * Mask B, Pentium, but not Pentium MMX
396 */ 348 */
@@ -440,7 +392,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
440 392
441valid_k7: 393valid_k7:
442 ; 394 ;
443#endif
444} 395}
445 396
446static void __cpuinit smp_checks(void) 397static void __cpuinit smp_checks(void)
@@ -487,7 +438,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
487 cpu_set(cpu, cpu_sibling_setup_map); 438 cpu_set(cpu, cpu_sibling_setup_map);
488 439
489 if (smp_num_siblings > 1) { 440 if (smp_num_siblings > 1) {
490 for_each_cpu_mask(i, cpu_sibling_setup_map) { 441 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
491 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 442 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
492 c->cpu_core_id == cpu_data(i).cpu_core_id) { 443 c->cpu_core_id == cpu_data(i).cpu_core_id) {
493 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 444 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -510,7 +461,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
510 return; 461 return;
511 } 462 }
512 463
513 for_each_cpu_mask(i, cpu_sibling_setup_map) { 464 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
514 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
515 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
516 cpu_set(i, c->llc_shared_map); 467 cpu_set(i, c->llc_shared_map);
@@ -555,23 +506,6 @@ cpumask_t cpu_coregroup_map(int cpu)
555 return c->llc_shared_map; 506 return c->llc_shared_map;
556} 507}
557 508
558#ifdef CONFIG_X86_32
559/*
560 * We are called very early to get the low memory for the
561 * SMP bootup trampoline page.
562 */
563void __init smp_alloc_memory(void)
564{
565 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
566 /*
567 * Has to be in very low memory so we can execute
568 * real-mode AP code.
569 */
570 if (__pa(trampoline_base) >= 0x9F000)
571 BUG();
572}
573#endif
574
575static void impress_friends(void) 509static void impress_friends(void)
576{ 510{
577 int cpu; 511 int cpu;
@@ -579,7 +513,7 @@ static void impress_friends(void)
579 /* 513 /*
580 * Allow the user to impress friends. 514 * Allow the user to impress friends.
581 */ 515 */
582 Dprintk("Before bogomips.\n"); 516 pr_debug("Before bogomips.\n");
583 for_each_possible_cpu(cpu) 517 for_each_possible_cpu(cpu)
584 if (cpu_isset(cpu, cpu_callout_map)) 518 if (cpu_isset(cpu, cpu_callout_map))
585 bogosum += cpu_data(cpu).loops_per_jiffy; 519 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -589,7 +523,7 @@ static void impress_friends(void)
589 bogosum/(500000/HZ), 523 bogosum/(500000/HZ),
590 (bogosum/(5000/HZ))%100); 524 (bogosum/(5000/HZ))%100);
591 525
592 Dprintk("Before bogocount - setting activated=1.\n"); 526 pr_debug("Before bogocount - setting activated=1.\n");
593} 527}
594 528
595static inline void __inquire_remote_apic(int apicid) 529static inline void __inquire_remote_apic(int apicid)
@@ -612,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
612 printk(KERN_CONT 546 printk(KERN_CONT
613 "a previous APIC delivery may have failed\n"); 547 "a previous APIC delivery may have failed\n");
614 548
615 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
616 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
617 551
618 timeout = 0; 552 timeout = 0;
619 do { 553 do {
@@ -645,29 +579,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
645 int maxlvt; 579 int maxlvt;
646 580
647 /* Target chip */ 581 /* Target chip */
648 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
649 583
650 /* Boot on the stack */ 584 /* Boot on the stack */
651 /* Kick the second */ 585 /* Kick the second */
652 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
653 587
654 Dprintk("Waiting for send to finish...\n"); 588 pr_debug("Waiting for send to finish...\n");
655 send_status = safe_apic_wait_icr_idle(); 589 send_status = safe_apic_wait_icr_idle();
656 590
657 /* 591 /*
658 * Give the other CPU some time to accept the IPI. 592 * Give the other CPU some time to accept the IPI.
659 */ 593 */
660 udelay(200); 594 udelay(200);
661 /*
662 * Due to the Pentium erratum 3AP.
663 */
664 maxlvt = lapic_get_maxlvt(); 595 maxlvt = lapic_get_maxlvt();
665 if (maxlvt > 3) { 596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
666 apic_read_around(APIC_SPIV);
667 apic_write(APIC_ESR, 0); 597 apic_write(APIC_ESR, 0);
668 }
669 accept_status = (apic_read(APIC_ESR) & 0xEF); 598 accept_status = (apic_read(APIC_ESR) & 0xEF);
670 Dprintk("NMI sent.\n"); 599 pr_debug("NMI sent.\n");
671 600
672 if (send_status) 601 if (send_status)
673 printk(KERN_ERR "APIC never delivered???\n"); 602 printk(KERN_ERR "APIC never delivered???\n");
@@ -691,42 +620,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
691 return send_status; 620 return send_status;
692 } 621 }
693 622
623 maxlvt = lapic_get_maxlvt();
624
694 /* 625 /*
695 * Be paranoid about clearing APIC errors. 626 * Be paranoid about clearing APIC errors.
696 */ 627 */
697 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 628 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
698 apic_read_around(APIC_SPIV); 629 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
699 apic_write(APIC_ESR, 0); 630 apic_write(APIC_ESR, 0);
700 apic_read(APIC_ESR); 631 apic_read(APIC_ESR);
701 } 632 }
702 633
703 Dprintk("Asserting INIT.\n"); 634 pr_debug("Asserting INIT.\n");
704 635
705 /* 636 /*
706 * Turn INIT on target chip 637 * Turn INIT on target chip
707 */ 638 */
708 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
709 640
710 /* 641 /*
711 * Send IPI 642 * Send IPI
712 */ 643 */
713 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 644 apic_write(APIC_ICR,
714 | APIC_DM_INIT); 645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
715 646
716 Dprintk("Waiting for send to finish...\n"); 647 pr_debug("Waiting for send to finish...\n");
717 send_status = safe_apic_wait_icr_idle(); 648 send_status = safe_apic_wait_icr_idle();
718 649
719 mdelay(10); 650 mdelay(10);
720 651
721 Dprintk("Deasserting INIT.\n"); 652 pr_debug("Deasserting INIT.\n");
722 653
723 /* Target chip */ 654 /* Target chip */
724 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
725 656
726 /* Send IPI */ 657 /* Send IPI */
727 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
728 659
729 Dprintk("Waiting for send to finish...\n"); 660 pr_debug("Waiting for send to finish...\n");
730 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
731 662
732 mb(); 663 mb();
@@ -748,64 +679,52 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
748 * target processor state. 679 * target processor state.
749 */ 680 */
750 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 681 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
751#ifdef CONFIG_X86_64
752 (unsigned long)init_rsp);
753#else
754 (unsigned long)stack_start.sp); 682 (unsigned long)stack_start.sp);
755#endif
756 683
757 /* 684 /*
758 * Run STARTUP IPI loop. 685 * Run STARTUP IPI loop.
759 */ 686 */
760 Dprintk("#startup loops: %d.\n", num_starts); 687 pr_debug("#startup loops: %d.\n", num_starts);
761
762 maxlvt = lapic_get_maxlvt();
763 688
764 for (j = 1; j <= num_starts; j++) { 689 for (j = 1; j <= num_starts; j++) {
765 Dprintk("Sending STARTUP #%d.\n", j); 690 pr_debug("Sending STARTUP #%d.\n", j);
766 apic_read_around(APIC_SPIV); 691 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
767 apic_write(APIC_ESR, 0); 692 apic_write(APIC_ESR, 0);
768 apic_read(APIC_ESR); 693 apic_read(APIC_ESR);
769 Dprintk("After apic_write.\n"); 694 pr_debug("After apic_write.\n");
770 695
771 /* 696 /*
772 * STARTUP IPI 697 * STARTUP IPI
773 */ 698 */
774 699
775 /* Target chip */ 700 /* Target chip */
776 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
777 702
778 /* Boot on the stack */ 703 /* Boot on the stack */
779 /* Kick the second */ 704 /* Kick the second */
780 apic_write_around(APIC_ICR, APIC_DM_STARTUP 705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
781 | (start_eip >> 12));
782 706
783 /* 707 /*
784 * Give the other CPU some time to accept the IPI. 708 * Give the other CPU some time to accept the IPI.
785 */ 709 */
786 udelay(300); 710 udelay(300);
787 711
788 Dprintk("Startup point 1.\n"); 712 pr_debug("Startup point 1.\n");
789 713
790 Dprintk("Waiting for send to finish...\n"); 714 pr_debug("Waiting for send to finish...\n");
791 send_status = safe_apic_wait_icr_idle(); 715 send_status = safe_apic_wait_icr_idle();
792 716
793 /* 717 /*
794 * Give the other CPU some time to accept the IPI. 718 * Give the other CPU some time to accept the IPI.
795 */ 719 */
796 udelay(200); 720 udelay(200);
797 /* 721 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
798 * Due to the Pentium erratum 3AP.
799 */
800 if (maxlvt > 3) {
801 apic_read_around(APIC_SPIV);
802 apic_write(APIC_ESR, 0); 722 apic_write(APIC_ESR, 0);
803 }
804 accept_status = (apic_read(APIC_ESR) & 0xEF); 723 accept_status = (apic_read(APIC_ESR) & 0xEF);
805 if (send_status || accept_status) 724 if (send_status || accept_status)
806 break; 725 break;
807 } 726 }
808 Dprintk("After Startup.\n"); 727 pr_debug("After Startup.\n");
809 728
810 if (send_status) 729 if (send_status)
811 printk(KERN_ERR "APIC never delivered???\n"); 730 printk(KERN_ERR "APIC never delivered???\n");
@@ -832,6 +751,45 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
832 complete(&c_idle->done); 751 complete(&c_idle->done);
833} 752}
834 753
754#ifdef CONFIG_X86_64
755/*
756 * Allocate node local memory for the AP pda.
757 *
758 * Must be called after the _cpu_pda pointer table is initialized.
759 */
760int __cpuinit get_local_pda(int cpu)
761{
762 struct x8664_pda *oldpda, *newpda;
763 unsigned long size = sizeof(struct x8664_pda);
764 int node = cpu_to_node(cpu);
765
766 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
767 return 0;
768
769 oldpda = cpu_pda(cpu);
770 newpda = kmalloc_node(size, GFP_ATOMIC, node);
771 if (!newpda) {
772 printk(KERN_ERR "Could not allocate node local PDA "
773 "for CPU %d on node %d\n", cpu, node);
774
775 if (oldpda)
776 return 0; /* have a usable pda */
777 else
778 return -1;
779 }
780
781 if (oldpda) {
782 memcpy(newpda, oldpda, size);
783 if (!after_bootmem)
784 free_bootmem((unsigned long)oldpda, size);
785 }
786
787 newpda->in_bootmem = 0;
788 cpu_pda(cpu) = newpda;
789 return 0;
790}
791#endif /* CONFIG_X86_64 */
792
835static int __cpuinit do_boot_cpu(int apicid, int cpu) 793static int __cpuinit do_boot_cpu(int apicid, int cpu)
836/* 794/*
837 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 795 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -848,28 +806,14 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
848 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 806 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
849 }; 807 };
850 INIT_WORK(&c_idle.work, do_fork_idle); 808 INIT_WORK(&c_idle.work, do_fork_idle);
851#ifdef CONFIG_X86_64
852 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
853 if (!cpu_gdt_descr[cpu].address &&
854 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
855 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
856 return -1;
857 }
858 809
810#ifdef CONFIG_X86_64
859 /* Allocate node local memory for AP pdas */ 811 /* Allocate node local memory for AP pdas */
860 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { 812 if (cpu > 0) {
861 struct x8664_pda *newpda, *pda; 813 boot_error = get_local_pda(cpu);
862 int node = cpu_to_node(cpu); 814 if (boot_error)
863 pda = cpu_pda(cpu); 815 goto restore_state;
864 newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, 816 /* if can't get pda memory, can't start cpu */
865 node);
866 if (newpda) {
867 memcpy(newpda, pda, sizeof(struct x8664_pda));
868 cpu_pda(cpu) = newpda;
869 } else
870 printk(KERN_ERR
871 "Could not allocate node local PDA for CPU %d on node %d\n",
872 cpu, node);
873 } 817 }
874#endif 818#endif
875 819
@@ -905,18 +849,15 @@ do_rest:
905#ifdef CONFIG_X86_32 849#ifdef CONFIG_X86_32
906 per_cpu(current_task, cpu) = c_idle.idle; 850 per_cpu(current_task, cpu) = c_idle.idle;
907 init_gdt(cpu); 851 init_gdt(cpu);
908 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
909 c_idle.idle->thread.ip = (unsigned long) start_secondary;
910 /* Stack for startup_32 can be just as for start_secondary onwards */ 852 /* Stack for startup_32 can be just as for start_secondary onwards */
911 stack_start.sp = (void *) c_idle.idle->thread.sp;
912 irq_ctx_init(cpu); 853 irq_ctx_init(cpu);
913#else 854#else
914 cpu_pda(cpu)->pcurrent = c_idle.idle; 855 cpu_pda(cpu)->pcurrent = c_idle.idle;
915 init_rsp = c_idle.idle->thread.sp;
916 load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
917 initial_code = (unsigned long)start_secondary;
918 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 856 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
919#endif 857#endif
858 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
859 initial_code = (unsigned long)start_secondary;
860 stack_start.sp = (void *) c_idle.idle->thread.sp;
920 861
921 /* start_ip had better be page-aligned! */ 862 /* start_ip had better be page-aligned! */
922 start_ip = setup_trampoline(); 863 start_ip = setup_trampoline();
@@ -934,7 +875,7 @@ do_rest:
934 875
935 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 876 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
936 877
937 Dprintk("Setting warm reset code and vector.\n"); 878 pr_debug("Setting warm reset code and vector.\n");
938 879
939 store_NMI_vector(&nmi_high, &nmi_low); 880 store_NMI_vector(&nmi_high, &nmi_low);
940 881
@@ -955,9 +896,9 @@ do_rest:
955 /* 896 /*
956 * allow APs to start initializing. 897 * allow APs to start initializing.
957 */ 898 */
958 Dprintk("Before Callout %d.\n", cpu); 899 pr_debug("Before Callout %d.\n", cpu);
959 cpu_set(cpu, cpu_callout_map); 900 cpu_set(cpu, cpu_callout_map);
960 Dprintk("After Callout %d.\n", cpu); 901 pr_debug("After Callout %d.\n", cpu);
961 902
962 /* 903 /*
963 * Wait 5s total for a response 904 * Wait 5s total for a response
@@ -970,10 +911,10 @@ do_rest:
970 911
971 if (cpu_isset(cpu, cpu_callin_map)) { 912 if (cpu_isset(cpu, cpu_callin_map)) {
972 /* number CPUs logically, starting from 1 (BSP is 0) */ 913 /* number CPUs logically, starting from 1 (BSP is 0) */
973 Dprintk("OK.\n"); 914 pr_debug("OK.\n");
974 printk(KERN_INFO "CPU%d: ", cpu); 915 printk(KERN_INFO "CPU%d: ", cpu);
975 print_cpu_info(&cpu_data(cpu)); 916 print_cpu_info(&cpu_data(cpu));
976 Dprintk("CPU has booted.\n"); 917 pr_debug("CPU has booted.\n");
977 } else { 918 } else {
978 boot_error = 1; 919 boot_error = 1;
979 if (*((volatile unsigned char *)trampoline_base) 920 if (*((volatile unsigned char *)trampoline_base)
@@ -987,13 +928,12 @@ do_rest:
987 inquire_remote_apic(apicid); 928 inquire_remote_apic(apicid);
988 } 929 }
989 } 930 }
990
991 if (boot_error) {
992 /* Try to put things back the way they were before ... */
993 unmap_cpu_to_logical_apicid(cpu);
994#ifdef CONFIG_X86_64 931#ifdef CONFIG_X86_64
995 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 932restore_state:
996#endif 933#endif
934 if (boot_error) {
935 /* Try to put things back the way they were before ... */
936 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
997 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ 937 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
998 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 938 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
999 cpu_clear(cpu, cpu_present_map); 939 cpu_clear(cpu, cpu_present_map);
@@ -1019,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1019 959
1020 WARN_ON(irqs_disabled()); 960 WARN_ON(irqs_disabled());
1021 961
1022 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 962 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1023 963
1024 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 964 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
1025 !physid_isset(apicid, phys_cpu_present_map)) { 965 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -1031,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1031 * Already booted CPU? 971 * Already booted CPU?
1032 */ 972 */
1033 if (cpu_isset(cpu, cpu_callin_map)) { 973 if (cpu_isset(cpu, cpu_callin_map)) {
1034 Dprintk("do_boot_cpu %d Already started\n", cpu); 974 pr_debug("do_boot_cpu %d Already started\n", cpu);
1035 return -ENOSYS; 975 return -ENOSYS;
1036 } 976 }
1037 977
@@ -1058,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1058 err = do_boot_cpu(apicid, cpu); 998 err = do_boot_cpu(apicid, cpu);
1059#endif 999#endif
1060 if (err) { 1000 if (err) {
1061 Dprintk("do_boot_cpu failed %d\n", err); 1001 pr_debug("do_boot_cpu failed %d\n", err);
1062 return -EIO; 1002 return -EIO;
1063 } 1003 }
1064 1004
@@ -1087,14 +1027,12 @@ static __init void disable_smp(void)
1087{ 1027{
1088 cpu_present_map = cpumask_of_cpu(0); 1028 cpu_present_map = cpumask_of_cpu(0);
1089 cpu_possible_map = cpumask_of_cpu(0); 1029 cpu_possible_map = cpumask_of_cpu(0);
1090#ifdef CONFIG_X86_32
1091 smpboot_clear_io_apic_irqs(); 1030 smpboot_clear_io_apic_irqs();
1092#endif 1031
1093 if (smp_found_config) 1032 if (smp_found_config)
1094 phys_cpu_present_map = 1033 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1095 physid_mask_of_physid(boot_cpu_physical_apicid);
1096 else 1034 else
1097 phys_cpu_present_map = physid_mask_of_physid(0); 1035 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1098 map_cpu_to_logical_apicid(); 1036 map_cpu_to_logical_apicid();
1099 cpu_set(0, per_cpu(cpu_sibling_map, 0)); 1037 cpu_set(0, per_cpu(cpu_sibling_map, 0));
1100 cpu_set(0, per_cpu(cpu_core_map, 0)); 1038 cpu_set(0, per_cpu(cpu_core_map, 0));
@@ -1157,12 +1095,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
1157 * If SMP should be disabled, then really disable it! 1095 * If SMP should be disabled, then really disable it!
1158 */ 1096 */
1159 if (!max_cpus) { 1097 if (!max_cpus) {
1160 printk(KERN_INFO "SMP mode deactivated," 1098 printk(KERN_INFO "SMP mode deactivated.\n");
1161 "forcing use of dummy APIC emulation.\n");
1162 smpboot_clear_io_apic(); 1099 smpboot_clear_io_apic();
1163#ifdef CONFIG_X86_32 1100
1101 localise_nmi_watchdog();
1102
1164 connect_bsp_APIC(); 1103 connect_bsp_APIC();
1165#endif
1166 setup_local_APIC(); 1104 setup_local_APIC();
1167 end_local_APIC_setup(); 1105 end_local_APIC_setup();
1168 return -1; 1106 return -1;
@@ -1190,7 +1128,6 @@ static void __init smp_cpu_index_default(void)
1190void __init native_smp_prepare_cpus(unsigned int max_cpus) 1128void __init native_smp_prepare_cpus(unsigned int max_cpus)
1191{ 1129{
1192 preempt_disable(); 1130 preempt_disable();
1193 nmi_watchdog_default();
1194 smp_cpu_index_default(); 1131 smp_cpu_index_default();
1195 current_cpu_data = boot_cpu_data; 1132 current_cpu_data = boot_cpu_data;
1196 cpu_callin_map = cpumask_of_cpu(0); 1133 cpu_callin_map = cpumask_of_cpu(0);
@@ -1217,9 +1154,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1217 } 1154 }
1218 preempt_enable(); 1155 preempt_enable();
1219 1156
1220#ifdef CONFIG_X86_32
1221 connect_bsp_APIC(); 1157 connect_bsp_APIC();
1222#endif 1158
1223 /* 1159 /*
1224 * Switch from PIC to APIC mode. 1160 * Switch from PIC to APIC mode.
1225 */ 1161 */
@@ -1257,8 +1193,8 @@ void __init native_smp_prepare_boot_cpu(void)
1257 int me = smp_processor_id(); 1193 int me = smp_processor_id();
1258#ifdef CONFIG_X86_32 1194#ifdef CONFIG_X86_32
1259 init_gdt(me); 1195 init_gdt(me);
1260 switch_to_new_gdt();
1261#endif 1196#endif
1197 switch_to_new_gdt();
1262 /* already set me in cpu_online_map in boot_cpu_init() */ 1198 /* already set me in cpu_online_map in boot_cpu_init() */
1263 cpu_set(me, cpu_callout_map); 1199 cpu_set(me, cpu_callout_map);
1264 per_cpu(cpu_state, me) = CPU_ONLINE; 1200 per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1266,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void)
1266 1202
1267void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
1268{ 1204{
1269 Dprintk("Boot done.\n"); 1205 pr_debug("Boot done.\n");
1270 1206
1271 impress_friends(); 1207 impress_friends();
1272 smp_checks(); 1208 smp_checks();
@@ -1278,29 +1214,12 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1278 1214
1279#ifdef CONFIG_HOTPLUG_CPU 1215#ifdef CONFIG_HOTPLUG_CPU
1280 1216
1281# ifdef CONFIG_X86_32
1282void cpu_exit_clear(void)
1283{
1284 int cpu = raw_smp_processor_id();
1285
1286 idle_task_exit();
1287
1288 cpu_uninit();
1289 irq_ctx_exit(cpu);
1290
1291 cpu_clear(cpu, cpu_callout_map);
1292 cpu_clear(cpu, cpu_callin_map);
1293
1294 unmap_cpu_to_logical_apicid(cpu);
1295}
1296# endif /* CONFIG_X86_32 */
1297
1298static void remove_siblinginfo(int cpu) 1217static void remove_siblinginfo(int cpu)
1299{ 1218{
1300 int sibling; 1219 int sibling;
1301 struct cpuinfo_x86 *c = &cpu_data(cpu); 1220 struct cpuinfo_x86 *c = &cpu_data(cpu);
1302 1221
1303 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { 1222 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1304 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1223 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1305 /*/ 1224 /*/
1306 * last thread sibling in this cpu core going down 1225 * last thread sibling in this cpu core going down
@@ -1309,7 +1228,7 @@ static void remove_siblinginfo(int cpu)
1309 cpu_data(sibling).booted_cores--; 1228 cpu_data(sibling).booted_cores--;
1310 } 1229 }
1311 1230
1312 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) 1231 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1313 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1232 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1314 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1233 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1315 cpus_clear(per_cpu(cpu_core_map, cpu)); 1234 cpus_clear(per_cpu(cpu_core_map, cpu));
@@ -1348,12 +1267,20 @@ __init void prefill_possible_map(void)
1348 int i; 1267 int i;
1349 int possible; 1268 int possible;
1350 1269
1270 /* no processor from mptable or madt */
1271 if (!num_processors)
1272 num_processors = 1;
1273
1274#ifdef CONFIG_HOTPLUG_CPU
1351 if (additional_cpus == -1) { 1275 if (additional_cpus == -1) {
1352 if (disabled_cpus > 0) 1276 if (disabled_cpus > 0)
1353 additional_cpus = disabled_cpus; 1277 additional_cpus = disabled_cpus;
1354 else 1278 else
1355 additional_cpus = 0; 1279 additional_cpus = 0;
1356 } 1280 }
1281#else
1282 additional_cpus = 0;
1283#endif
1357 possible = num_processors + additional_cpus; 1284 possible = num_processors + additional_cpus;
1358 if (possible > NR_CPUS) 1285 if (possible > NR_CPUS)
1359 possible = NR_CPUS; 1286 possible = NR_CPUS;
@@ -1363,18 +1290,18 @@ __init void prefill_possible_map(void)
1363 1290
1364 for (i = 0; i < possible; i++) 1291 for (i = 0; i < possible; i++)
1365 cpu_set(i, cpu_possible_map); 1292 cpu_set(i, cpu_possible_map);
1293
1294 nr_cpu_ids = possible;
1366} 1295}
1367 1296
1368static void __ref remove_cpu_from_maps(int cpu) 1297static void __ref remove_cpu_from_maps(int cpu)
1369{ 1298{
1370 cpu_clear(cpu, cpu_online_map); 1299 cpu_clear(cpu, cpu_online_map);
1371#ifdef CONFIG_X86_64
1372 cpu_clear(cpu, cpu_callout_map); 1300 cpu_clear(cpu, cpu_callout_map);
1373 cpu_clear(cpu, cpu_callin_map); 1301 cpu_clear(cpu, cpu_callin_map);
1374 /* was set by cpu_init() */ 1302 /* was set by cpu_init() */
1375 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1303 cpu_clear(cpu, cpu_initialized);
1376 clear_node_cpumask(cpu); 1304 numa_remove_cpu(cpu);
1377#endif
1378} 1305}
1379 1306
1380int __cpu_disable(void) 1307int __cpu_disable(void)
@@ -1452,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
1452{ 1379{
1453 extern unsigned int maxcpus; 1380 extern unsigned int maxcpus;
1454 1381
1455 maxcpus = simple_strtoul(arg, NULL, 0); 1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1456 return 0; 1384 return 0;
1457} 1385}
1458early_param("maxcpus", parse_maxcpus); 1386early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 3449064d141a..99941b37eca0 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu)
25 per_cpu(cpu_number, cpu) = cpu; 25 per_cpu(cpu_number, cpu) = cpu;
26} 26}
27#endif 27#endif
28
29/**
30 * smp_call_function(): Run a function on all other CPUs.
31 * @func: The function to run. This must be fast and non-blocking.
32 * @info: An arbitrary pointer to pass to the function.
33 * @nonatomic: Unused.
34 * @wait: If true, wait (atomically) until function has completed on other CPUs.
35 *
36 * Returns 0 on success, else a negative status code.
37 *
38 * If @wait is true, then returns once @func has returned; otherwise
39 * it returns just before the target cpu calls @func.
40 *
41 * You must not call this function with disabled interrupts or from a
42 * hardware interrupt handler or from a bottom half handler.
43 */
44int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
45 int wait)
46{
47 return smp_call_function_mask(cpu_online_map, func, info, wait);
48}
49EXPORT_SYMBOL(smp_call_function);
50
51/**
52 * smp_call_function_single - Run a function on a specific CPU
53 * @cpu: The target CPU. Cannot be the calling CPU.
54 * @func: The function to run. This must be fast and non-blocking.
55 * @info: An arbitrary pointer to pass to the function.
56 * @nonatomic: Unused.
57 * @wait: If true, wait until function has completed on other CPUs.
58 *
59 * Returns 0 on success, else a negative status code.
60 *
61 * If @wait is true, then returns once @func has returned; otherwise
62 * it returns just before the target cpu calls @func.
63 */
64int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
65 int nonatomic, int wait)
66{
67 /* prevent preemption and reschedule on another processor */
68 int ret;
69 int me = get_cpu();
70 if (cpu == me) {
71 local_irq_disable();
72 func(info);
73 local_irq_enable();
74 put_cpu();
75 return 0;
76 }
77
78 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
79
80 put_cpu();
81 return ret;
82}
83EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c28c342c162f..a03e7f6d90c3 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace)
74 if (trace->nr_entries < trace->max_entries) 74 if (trace->nr_entries < trace->max_entries)
75 trace->entries[trace->nr_entries++] = ULONG_MAX; 75 trace->entries[trace->nr_entries++] = ULONG_MAX;
76} 76}
77EXPORT_SYMBOL_GPL(save_stack_trace);
77 78
78void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 79void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
79{ 80{
@@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81 if (trace->nr_entries < trace->max_entries) 82 if (trace->nr_entries < trace->max_entries)
82 trace->entries[trace->nr_entries++] = ULONG_MAX; 83 trace->entries[trace->nr_entries++] = ULONG_MAX;
83} 84}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..d67ce5f044ba 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; 37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
38 38
39#ifndef CONFIG_X86_NUMAQ
39static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; 40static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
41#endif
40 42
41static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) 43static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
42{ 44{
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6b..7066cb855a60 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,8 @@
19#include <linux/utsname.h> 19#include <linux/utsname.h>
20#include <linux/ipc.h> 20#include <linux/ipc.h>
21 21
22#include <asm/uaccess.h> 22#include <linux/uaccess.h>
23#include <asm/unistd.h> 23#include <linux/unistd.h>
24 24
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 26 unsigned long prot, unsigned long flags,
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
103 * 103 *
104 * This is really horribly ugly. 104 * This is really horribly ugly.
105 */ 105 */
106asmlinkage int sys_ipc (uint call, int first, int second, 106asmlinkage int sys_ipc(uint call, int first, int second,
107 int third, void __user *ptr, long fifth) 107 int third, void __user *ptr, long fifth)
108{ 108{
109 int version, ret; 109 int version, ret;
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
113 113
114 switch (call) { 114 switch (call) {
115 case SEMOP: 115 case SEMOP:
116 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); 116 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
117 case SEMTIMEDOP: 117 case SEMTIMEDOP:
118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, 118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
119 (const struct timespec __user *)fifth); 119 (const struct timespec __user *)fifth);
120 120
121 case SEMGET: 121 case SEMGET:
122 return sys_semget (first, second, third); 122 return sys_semget(first, second, third);
123 case SEMCTL: { 123 case SEMCTL: {
124 union semun fourth; 124 union semun fourth;
125 if (!ptr) 125 if (!ptr)
126 return -EINVAL; 126 return -EINVAL;
127 if (get_user(fourth.__pad, (void __user * __user *) ptr)) 127 if (get_user(fourth.__pad, (void __user * __user *) ptr))
128 return -EFAULT; 128 return -EFAULT;
129 return sys_semctl (first, second, third, fourth); 129 return sys_semctl(first, second, third, fourth);
130 } 130 }
131 131
132 case MSGSND: 132 case MSGSND:
133 return sys_msgsnd (first, (struct msgbuf __user *) ptr, 133 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
134 second, third); 134 second, third);
135 case MSGRCV: 135 case MSGRCV:
136 switch (version) { 136 switch (version) {
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
138 struct ipc_kludge tmp; 138 struct ipc_kludge tmp;
139 if (!ptr) 139 if (!ptr)
140 return -EINVAL; 140 return -EINVAL;
141 141
142 if (copy_from_user(&tmp, 142 if (copy_from_user(&tmp,
143 (struct ipc_kludge __user *) ptr, 143 (struct ipc_kludge __user *) ptr,
144 sizeof (tmp))) 144 sizeof(tmp)))
145 return -EFAULT; 145 return -EFAULT;
146 return sys_msgrcv (first, tmp.msgp, second, 146 return sys_msgrcv(first, tmp.msgp, second,
147 tmp.msgtyp, third); 147 tmp.msgtyp, third);
148 } 148 }
149 default: 149 default:
150 return sys_msgrcv (first, 150 return sys_msgrcv(first,
151 (struct msgbuf __user *) ptr, 151 (struct msgbuf __user *) ptr,
152 second, fifth, third); 152 second, fifth, third);
153 } 153 }
154 case MSGGET: 154 case MSGGET:
155 return sys_msgget ((key_t) first, second); 155 return sys_msgget((key_t) first, second);
156 case MSGCTL: 156 case MSGCTL:
157 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); 157 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
158 158
159 case SHMAT: 159 case SHMAT:
160 switch (version) { 160 switch (version) {
161 default: { 161 default: {
162 ulong raddr; 162 ulong raddr;
163 ret = do_shmat (first, (char __user *) ptr, second, &raddr); 163 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
164 if (ret) 164 if (ret)
165 return ret; 165 return ret;
166 return put_user (raddr, (ulong __user *) third); 166 return put_user(raddr, (ulong __user *) third);
167 } 167 }
168 case 1: /* iBCS2 emulator entry point */ 168 case 1: /* iBCS2 emulator entry point */
169 if (!segment_eq(get_fs(), get_ds())) 169 if (!segment_eq(get_fs(), get_ds()))
170 return -EINVAL; 170 return -EINVAL;
171 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ 171 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
172 return do_shmat (first, (char __user *) ptr, second, (ulong *) third); 172 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
173 } 173 }
174 case SHMDT: 174 case SHMDT:
175 return sys_shmdt ((char __user *)ptr); 175 return sys_shmdt((char __user *)ptr);
176 case SHMGET: 176 case SHMGET:
177 return sys_shmget (first, second, third); 177 return sys_shmget(first, second, third);
178 case SHMCTL: 178 case SHMCTL:
179 return sys_shmctl (first, second, 179 return sys_shmctl(first, second,
180 (struct shmid_ds __user *) ptr); 180 (struct shmid_ds __user *) ptr);
181 default: 181 default:
182 return -ENOSYS; 182 return -ENOSYS;
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
186/* 186/*
187 * Old cruft 187 * Old cruft
188 */ 188 */
189asmlinkage int sys_uname(struct old_utsname __user * name) 189asmlinkage int sys_uname(struct old_utsname __user *name)
190{ 190{
191 int err; 191 int err;
192 if (!name) 192 if (!name)
193 return -EFAULT; 193 return -EFAULT;
194 down_read(&uts_sem); 194 down_read(&uts_sem);
195 err = copy_to_user(name, utsname(), sizeof (*name)); 195 err = copy_to_user(name, utsname(), sizeof(*name));
196 up_read(&uts_sem); 196 up_read(&uts_sem);
197 return err?-EFAULT:0; 197 return err? -EFAULT:0;
198} 198}
199 199
200asmlinkage int sys_olduname(struct oldold_utsname __user * name) 200asmlinkage int sys_olduname(struct oldold_utsname __user *name)
201{ 201{
202 int error; 202 int error;
203 203
204 if (!name) 204 if (!name)
205 return -EFAULT; 205 return -EFAULT;
206 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) 206 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
207 return -EFAULT; 207 return -EFAULT;
208 208
209 down_read(&uts_sem); 209 down_read(&uts_sem);
210 210
211 error = __copy_to_user(&name->sysname, &utsname()->sysname, 211 error = __copy_to_user(&name->sysname, &utsname()->sysname,
212 __OLD_UTS_LEN); 212 __OLD_UTS_LEN);
213 error |= __put_user(0, name->sysname + __OLD_UTS_LEN); 213 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
223 error |= __copy_to_user(&name->machine, &utsname()->machine, 223 error |= __copy_to_user(&name->machine, &utsname()->machine,
224 __OLD_UTS_LEN); 224 __OLD_UTS_LEN);
225 error |= __put_user(0, name->machine + __OLD_UTS_LEN); 225 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
226 226
227 up_read(&uts_sem); 227 up_read(&uts_sem);
228 228
229 error = error ? -EFAULT : 0; 229 error = error ? -EFAULT : 0;
230 230
231 return error; 231 return error;
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
241 long __res; 241 long __res;
242 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 242 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
243 : "=a" (__res) 243 : "=a" (__res)
244 : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); 244 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
245 return __res; 245 return __res;
246} 246}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f398934..ffe3c664afc0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -39,9 +39,6 @@
39 39
40#include "do_timer.h" 40#include "do_timer.h"
41 41
42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
43EXPORT_SYMBOL(cpu_khz);
44
45int timer_ack; 42int timer_ack;
46 43
47unsigned long profile_pc(struct pt_regs *regs) 44unsigned long profile_pc(struct pt_regs *regs)
@@ -84,8 +81,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
84 if (timer_ack) { 81 if (timer_ack) {
85 /* 82 /*
86 * Subtle, when I/O APICs are used we have to ack timer IRQ 83 * Subtle, when I/O APICs are used we have to ack timer IRQ
87 * manually to reset the IRR bit for do_slow_gettimeoffset(). 84 * manually to deassert NMI lines for the watchdog if run
88 * This will also deassert NMI lines for the watchdog if run
89 * on an 82489DX-based system. 85 * on an 82489DX-based system.
90 */ 86 */
91 spin_lock(&i8259A_lock); 87 spin_lock(&i8259A_lock);
@@ -133,6 +129,7 @@ void __init hpet_time_init(void)
133 */ 129 */
134void __init time_init(void) 130void __init time_init(void)
135{ 131{
132 pre_time_init_hook();
136 tsc_init(); 133 tsc_init();
137 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
138} 135}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef7..e3d49c553af2 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
56/* calibrate_cpu is used on systems with fixed rate TSCs to determine 56/* calibrate_cpu is used on systems with fixed rate TSCs to determine
57 * processor frequency */ 57 * processor frequency */
58#define TICK_COUNT 100000000 58#define TICK_COUNT 100000000
59unsigned long __init native_calculate_cpu_khz(void) 59unsigned long __init calibrate_cpu(void)
60{ 60{
61 int tsc_start, tsc_now; 61 int tsc_start, tsc_now;
62 int i, no_ctr_free; 62 int i, no_ctr_free;
@@ -116,23 +116,11 @@ void __init hpet_time_init(void)
116 116
117void __init time_init(void) 117void __init time_init(void)
118{ 118{
119 tsc_calibrate(); 119 tsc_init();
120
121 cpu_khz = tsc_khz;
122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
124 cpu_khz = calculate_cpu_khz();
125
126 if (unsynchronized_tsc())
127 mark_tsc_unstable("TSCs unsynchronized");
128
129 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) 120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
130 vgetcpu_mode = VGETCPU_RDTSCP; 121 vgetcpu_mode = VGETCPU_RDTSCP;
131 else 122 else
132 vgetcpu_mode = VGETCPU_LSL; 123 vgetcpu_mode = VGETCPU_LSL;
133 124
134 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
135 cpu_khz / 1000, cpu_khz % 1000);
136 init_tsc_clocksource();
137 late_time_init = choose_time_init(); 125 late_time_init = choose_time_init();
138} 126}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 9bb2363851af..fec1ecedc9b7 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info)
238 238
239void flush_tlb_all(void) 239void flush_tlb_all(void)
240{ 240{
241 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index a1f07d793202..dcbf7a1159ea 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -15,6 +15,8 @@
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/idle.h> 17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
18 20
19#include <mach_ipi.h> 21#include <mach_ipi.h>
20/* 22/*
@@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
162 union smp_flush_state *f; 164 union smp_flush_state *f;
163 cpumask_t cpumask = *cpumaskp; 165 cpumask_t cpumask = *cpumaskp;
164 166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
165 /* Caller has disabled preemption */ 170 /* Caller has disabled preemption */
166 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
167 f = &per_cpu(flush_state, sender); 172 f = &per_cpu(flush_state, sender);
@@ -270,5 +275,5 @@ static void do_flush_tlb_all(void *info)
270 275
271void flush_tlb_all(void) 276void flush_tlb_all(void)
272{ 277{
273 on_each_cpu(do_flush_tlb_all, NULL, 1, 1); 278 on_each_cpu(do_flush_tlb_all, NULL, 1);
274} 279}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
new file mode 100644
index 000000000000..d0fbb7712ab0
--- /dev/null
+++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,792 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/mc146818rtc.h>
10#include <linux/proc_fs.h>
11#include <linux/kernel.h>
12
13#include <asm/mmu_context.h>
14#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h>
17#include <asm/genapic.h>
18#include <asm/idle.h>
19#include <asm/tsc.h>
20
21#include <mach_apic.h>
22
23static struct bau_control **uv_bau_table_bases __read_mostly;
24static int uv_bau_retry_limit __read_mostly;
25
26/* position of pnode (which is nasid>>1): */
27static int uv_nshift __read_mostly;
28
29static unsigned long uv_mmask __read_mostly;
30
31static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
32static DEFINE_PER_CPU(struct bau_control, bau_control);
33
34/*
35 * Free a software acknowledge hardware resource by clearing its Pending
36 * bit. This will return a reply to the sender.
37 * If the message has timed out, a reply has already been sent by the
38 * hardware but the resource has not been released. In that case our
39 * clear of the Timeout bit (as well) will free the resource. No reply will
40 * be sent (the hardware will only do one reply per message).
41 */
42static void uv_reply_to_message(int resource,
43 struct bau_payload_queue_entry *msg,
44 struct bau_msg_status *msp)
45{
46 unsigned long dw;
47
48 dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
49 msg->replied_to = 1;
50 msg->sw_ack_vector = 0;
51 if (msp)
52 msp->seen_by.bits = 0;
53 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
54}
55
56/*
57 * Do all the things a cpu should do for a TLB shootdown message.
58 * Other cpu's may come here at the same time for this message.
59 */
60static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
61 int msg_slot, int sw_ack_slot)
62{
63 unsigned long this_cpu_mask;
64 struct bau_msg_status *msp;
65 int cpu;
66
67 msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
68 cpu = uv_blade_processor_id();
69 msg->number_of_cpus =
70 uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
71 this_cpu_mask = 1UL << cpu;
72 if (msp->seen_by.bits & this_cpu_mask)
73 return;
74 atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
75
76 if (msg->replied_to == 1)
77 return;
78
79 if (msg->address == TLB_FLUSH_ALL) {
80 local_flush_tlb();
81 __get_cpu_var(ptcstats).alltlb++;
82 } else {
83 __flush_tlb_one(msg->address);
84 __get_cpu_var(ptcstats).onetlb++;
85 }
86
87 __get_cpu_var(ptcstats).requestee++;
88
89 atomic_inc_short(&msg->acknowledge_count);
90 if (msg->number_of_cpus == msg->acknowledge_count)
91 uv_reply_to_message(sw_ack_slot, msg, msp);
92}
93
94/*
95 * Examine the payload queue on one distribution node to see
96 * which messages have not been seen, and which cpu(s) have not seen them.
97 *
98 * Returns the number of cpu's that have not responded.
99 */
100static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
101{
102 struct bau_payload_queue_entry *msg;
103 struct bau_msg_status *msp;
104 int count = 0;
105 int i;
106 int j;
107
108 for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
109 msg++, i++) {
110 if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
111 msp = bau_tablesp->msg_statuses + i;
112 printk(KERN_DEBUG
113 "blade %d: address:%#lx %d of %d, not cpu(s): ",
114 i, msg->address, msg->acknowledge_count,
115 msg->number_of_cpus);
116 for (j = 0; j < msg->number_of_cpus; j++) {
117 if (!((1L << j) & msp->seen_by.bits)) {
118 count++;
119 printk("%d ", j);
120 }
121 }
122 printk("\n");
123 }
124 }
125 return count;
126}
127
128/*
129 * Examine the payload queue on all the distribution nodes to see
130 * which messages have not been seen, and which cpu(s) have not seen them.
131 *
132 * Returns the number of cpu's that have not responded.
133 */
134static int uv_examine_destinations(struct bau_target_nodemask *distribution)
135{
136 int sender;
137 int i;
138 int count = 0;
139
140 sender = smp_processor_id();
141 for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
142 if (!bau_node_isset(i, distribution))
143 continue;
144 count += uv_examine_destination(uv_bau_table_bases[i], sender);
145 }
146 return count;
147}
148
149/*
150 * wait for completion of a broadcast message
151 *
152 * return COMPLETE, RETRY or GIVEUP
153 */
154static int uv_wait_completion(struct bau_desc *bau_desc,
155 unsigned long mmr_offset, int right_shift)
156{
157 int exams = 0;
158 long destination_timeouts = 0;
159 long source_timeouts = 0;
160 unsigned long descriptor_status;
161
162 while ((descriptor_status = (((unsigned long)
163 uv_read_local_mmr(mmr_offset) >>
164 right_shift) & UV_ACT_STATUS_MASK)) !=
165 DESC_STATUS_IDLE) {
166 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
167 source_timeouts++;
168 if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
169 source_timeouts = 0;
170 __get_cpu_var(ptcstats).s_retry++;
171 return FLUSH_RETRY;
172 }
173 /*
174 * spin here looking for progress at the destinations
175 */
176 if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
177 destination_timeouts++;
178 if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
179 /*
180 * returns number of cpus not responding
181 */
182 if (uv_examine_destinations
183 (&bau_desc->distribution) == 0) {
184 __get_cpu_var(ptcstats).d_retry++;
185 return FLUSH_RETRY;
186 }
187 exams++;
188 if (exams >= uv_bau_retry_limit) {
189 printk(KERN_DEBUG
190 "uv_flush_tlb_others");
191 printk("giving up on cpu %d\n",
192 smp_processor_id());
193 return FLUSH_GIVEUP;
194 }
195 /*
196 * delays can hang the simulator
197 udelay(1000);
198 */
199 destination_timeouts = 0;
200 }
201 }
202 }
203 return FLUSH_COMPLETE;
204}
205
206/**
207 * uv_flush_send_and_wait
208 *
209 * Send a broadcast and wait for a broadcast message to complete.
210 *
211 * The cpumaskp mask contains the cpus the broadcast was sent to.
212 *
213 * Returns 1 if all remote flushing was done. The mask is zeroed.
214 * Returns 0 if some remote flushing remains to be done. The mask is left
215 * unchanged.
216 */
217int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
218 cpumask_t *cpumaskp)
219{
220 int completion_status = 0;
221 int right_shift;
222 int tries = 0;
223 int blade;
224 int bit;
225 unsigned long mmr_offset;
226 unsigned long index;
227 cycles_t time1;
228 cycles_t time2;
229
230 if (cpu < UV_CPUS_PER_ACT_STATUS) {
231 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
232 right_shift = cpu * UV_ACT_STATUS_SIZE;
233 } else {
234 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
235 right_shift =
236 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
237 }
238 time1 = get_cycles();
239 do {
240 tries++;
241 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
242 cpu;
243 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
244 completion_status = uv_wait_completion(bau_desc, mmr_offset,
245 right_shift);
246 } while (completion_status == FLUSH_RETRY);
247 time2 = get_cycles();
248 __get_cpu_var(ptcstats).sflush += (time2 - time1);
249 if (tries > 1)
250 __get_cpu_var(ptcstats).retriesok++;
251
252 if (completion_status == FLUSH_GIVEUP) {
253 /*
254 * Cause the caller to do an IPI-style TLB shootdown on
255 * the cpu's, all of which are still in the mask.
256 */
257 __get_cpu_var(ptcstats).ptc_i++;
258 return 0;
259 }
260
261 /*
262 * Success, so clear the remote cpu's from the mask so we don't
263 * use the IPI method of shootdown on them.
264 */
265 for_each_cpu_mask(bit, *cpumaskp) {
266 blade = uv_cpu_to_blade_id(bit);
267 if (blade == this_blade)
268 continue;
269 cpu_clear(bit, *cpumaskp);
270 }
271 if (!cpus_empty(*cpumaskp))
272 return 0;
273 return 1;
274}
275
276/**
277 * uv_flush_tlb_others - globally purge translation cache of a virtual
278 * address or all TLB's
279 * @cpumaskp: mask of all cpu's in which the address is to be removed
280 * @mm: mm_struct containing virtual address range
281 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
282 *
283 * This is the entry point for initiating any UV global TLB shootdown.
284 *
285 * Purges the translation caches of all specified processors of the given
286 * virtual address, or purges all TLB's on specified processors.
287 *
288 * The caller has derived the cpumaskp from the mm_struct and has subtracted
289 * the local cpu from the mask. This function is called only if there
290 * are bits set in the mask. (e.g. flush_tlb_page())
291 *
292 * The cpumaskp is converted into a nodemask of the nodes containing
293 * the cpus.
294 *
295 * Returns 1 if all remote flushing was done.
296 * Returns 0 if some remote flushing remains to be done.
297 */
298int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
299 unsigned long va)
300{
301 int i;
302 int bit;
303 int blade;
304 int cpu;
305 int this_blade;
306 int locals = 0;
307 struct bau_desc *bau_desc;
308
309 cpu = uv_blade_processor_id();
310 this_blade = uv_numa_blade_id();
311 bau_desc = __get_cpu_var(bau_control).descriptor_base;
312 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
313
314 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
315
316 i = 0;
317 for_each_cpu_mask(bit, *cpumaskp) {
318 blade = uv_cpu_to_blade_id(bit);
319 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
320 if (blade == this_blade) {
321 locals++;
322 continue;
323 }
324 bau_node_set(blade, &bau_desc->distribution);
325 i++;
326 }
327 if (i == 0) {
328 /*
329 * no off_node flushing; return status for local node
330 */
331 if (locals)
332 return 0;
333 else
334 return 1;
335 }
336 __get_cpu_var(ptcstats).requestor++;
337 __get_cpu_var(ptcstats).ntargeted += i;
338
339 bau_desc->payload.address = va;
340 bau_desc->payload.sending_cpu = smp_processor_id();
341
342 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
343}
344
345/*
346 * The BAU message interrupt comes here. (registered by set_intr_gate)
347 * See entry_64.S
348 *
349 * We received a broadcast assist message.
350 *
351 * Interrupts may have been disabled; this interrupt could represent
352 * the receipt of several messages.
353 *
354 * All cores/threads on this node get this interrupt.
355 * The last one to see it does the s/w ack.
356 * (the resource will not be freed until noninterruptable cpus see this
357 * interrupt; hardware will timeout the s/w ack and reply ERROR)
358 */
359void uv_bau_message_interrupt(struct pt_regs *regs)
360{
361 struct bau_payload_queue_entry *va_queue_first;
362 struct bau_payload_queue_entry *va_queue_last;
363 struct bau_payload_queue_entry *msg;
364 struct pt_regs *old_regs = set_irq_regs(regs);
365 cycles_t time1;
366 cycles_t time2;
367 int msg_slot;
368 int sw_ack_slot;
369 int fw;
370 int count = 0;
371 unsigned long local_pnode;
372
373 ack_APIC_irq();
374 exit_idle();
375 irq_enter();
376
377 time1 = get_cycles();
378
379 local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
380
381 va_queue_first = __get_cpu_var(bau_control).va_queue_first;
382 va_queue_last = __get_cpu_var(bau_control).va_queue_last;
383
384 msg = __get_cpu_var(bau_control).bau_msg_head;
385 while (msg->sw_ack_vector) {
386 count++;
387 fw = msg->sw_ack_vector;
388 msg_slot = msg - va_queue_first;
389 sw_ack_slot = ffs(fw) - 1;
390
391 uv_bau_process_message(msg, msg_slot, sw_ack_slot);
392
393 msg++;
394 if (msg > va_queue_last)
395 msg = va_queue_first;
396 __get_cpu_var(bau_control).bau_msg_head = msg;
397 }
398 if (!count)
399 __get_cpu_var(ptcstats).nomsg++;
400 else if (count > 1)
401 __get_cpu_var(ptcstats).multmsg++;
402
403 time2 = get_cycles();
404 __get_cpu_var(ptcstats).dflush += (time2 - time1);
405
406 irq_exit();
407 set_irq_regs(old_regs);
408}
409
410static void uv_enable_timeouts(void)
411{
412 int i;
413 int blade;
414 int last_blade;
415 int pnode;
416 int cur_cpu = 0;
417 unsigned long apicid;
418
419 last_blade = -1;
420 for_each_online_node(i) {
421 blade = uv_node_to_blade_id(i);
422 if (blade == last_blade)
423 continue;
424 last_blade = blade;
425 apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
426 pnode = uv_blade_to_pnode(blade);
427 cur_cpu += uv_blade_nr_possible_cpus(i);
428 }
429}
430
431static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
432{
433 if (*offset < num_possible_cpus())
434 return offset;
435 return NULL;
436}
437
438static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
439{
440 (*offset)++;
441 if (*offset < num_possible_cpus())
442 return offset;
443 return NULL;
444}
445
446static void uv_ptc_seq_stop(struct seq_file *file, void *data)
447{
448}
449
450/*
451 * Display the statistics thru /proc
452 * data points to the cpu number
453 */
454static int uv_ptc_seq_show(struct seq_file *file, void *data)
455{
456 struct ptc_stats *stat;
457 int cpu;
458
459 cpu = *(loff_t *)data;
460
461 if (!cpu) {
462 seq_printf(file,
463 "# cpu requestor requestee one all sretry dretry ptc_i ");
464 seq_printf(file,
465 "sw_ack sflush dflush sok dnomsg dmult starget\n");
466 }
467 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
468 stat = &per_cpu(ptcstats, cpu);
469 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
470 cpu, stat->requestor,
471 stat->requestee, stat->onetlb, stat->alltlb,
472 stat->s_retry, stat->d_retry, stat->ptc_i);
473 seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
474 uv_read_global_mmr64(uv_blade_to_pnode
475 (uv_cpu_to_blade_id(cpu)),
476 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
477 stat->sflush, stat->dflush,
478 stat->retriesok, stat->nomsg,
479 stat->multmsg, stat->ntargeted);
480 }
481
482 return 0;
483}
484
485/*
486 * 0: display meaning of the statistics
487 * >0: retry limit
488 */
489static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
490 size_t count, loff_t *data)
491{
492 long newmode;
493 char optstr[64];
494
495 if (count == 0 || count > sizeof(optstr))
496 return -EINVAL;
497 if (copy_from_user(optstr, user, count))
498 return -EFAULT;
499 optstr[count - 1] = '\0';
500 if (strict_strtoul(optstr, 10, &newmode) < 0) {
501 printk(KERN_DEBUG "%s is invalid\n", optstr);
502 return -EINVAL;
503 }
504
505 if (newmode == 0) {
506 printk(KERN_DEBUG "# cpu: cpu number\n");
507 printk(KERN_DEBUG
508 "requestor: times this cpu was the flush requestor\n");
509 printk(KERN_DEBUG
510 "requestee: times this cpu was requested to flush its TLBs\n");
511 printk(KERN_DEBUG
512 "one: times requested to flush a single address\n");
513 printk(KERN_DEBUG
514 "all: times requested to flush all TLB's\n");
515 printk(KERN_DEBUG
516 "sretry: number of retries of source-side timeouts\n");
517 printk(KERN_DEBUG
518 "dretry: number of retries of destination-side timeouts\n");
519 printk(KERN_DEBUG
520 "ptc_i: times UV fell through to IPI-style flushes\n");
521 printk(KERN_DEBUG
522 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
523 printk(KERN_DEBUG
524 "sflush_us: cycles spent in uv_flush_tlb_others()\n");
525 printk(KERN_DEBUG
526 "dflush_us: cycles spent in handling flush requests\n");
527 printk(KERN_DEBUG "sok: successes on retry\n");
528 printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
529 printk(KERN_DEBUG
530 "dmult: interrupts with multiple messages\n");
531 printk(KERN_DEBUG "starget: nodes targeted\n");
532 } else {
533 uv_bau_retry_limit = newmode;
534 printk(KERN_DEBUG "timeout retry limit:%d\n",
535 uv_bau_retry_limit);
536 }
537
538 return count;
539}
540
541static const struct seq_operations uv_ptc_seq_ops = {
542 .start = uv_ptc_seq_start,
543 .next = uv_ptc_seq_next,
544 .stop = uv_ptc_seq_stop,
545 .show = uv_ptc_seq_show
546};
547
548static int uv_ptc_proc_open(struct inode *inode, struct file *file)
549{
550 return seq_open(file, &uv_ptc_seq_ops);
551}
552
553static const struct file_operations proc_uv_ptc_operations = {
554 .open = uv_ptc_proc_open,
555 .read = seq_read,
556 .write = uv_ptc_proc_write,
557 .llseek = seq_lseek,
558 .release = seq_release,
559};
560
561static int __init uv_ptc_init(void)
562{
563 struct proc_dir_entry *proc_uv_ptc;
564
565 if (!is_uv_system())
566 return 0;
567
568 if (!proc_mkdir("sgi_uv", NULL))
569 return -EINVAL;
570
571 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
572 if (!proc_uv_ptc) {
573 printk(KERN_ERR "unable to create %s proc entry\n",
574 UV_PTC_BASENAME);
575 remove_proc_entry("sgi_uv", NULL);
576 return -EINVAL;
577 }
578 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
579 return 0;
580}
581
582/*
583 * begin the initialization of the per-blade control structures
584 */
585static struct bau_control * __init uv_table_bases_init(int blade, int node)
586{
587 int i;
588 int *ip;
589 struct bau_msg_status *msp;
590 struct bau_control *bau_tabp;
591
592 bau_tabp =
593 kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
594 BUG_ON(!bau_tabp);
595
596 bau_tabp->msg_statuses =
597 kmalloc_node(sizeof(struct bau_msg_status) *
598 DEST_Q_SIZE, GFP_KERNEL, node);
599 BUG_ON(!bau_tabp->msg_statuses);
600
601 for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
602 bau_cpubits_clear(&msp->seen_by, (int)
603 uv_blade_nr_possible_cpus(blade));
604
605 bau_tabp->watching =
606 kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
607 BUG_ON(!bau_tabp->watching);
608
609 for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
610 *ip = 0;
611
612 uv_bau_table_bases[blade] = bau_tabp;
613
614 return bau_tabp;
615}
616
617/*
618 * finish the initialization of the per-blade control structures
619 */
620static void __init
621uv_table_bases_finish(int blade, int node, int cur_cpu,
622 struct bau_control *bau_tablesp,
623 struct bau_desc *adp)
624{
625 struct bau_control *bcp;
626 int i;
627
628 for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
629 bcp = (struct bau_control *)&per_cpu(bau_control, i);
630
631 bcp->bau_msg_head = bau_tablesp->va_queue_first;
632 bcp->va_queue_first = bau_tablesp->va_queue_first;
633 bcp->va_queue_last = bau_tablesp->va_queue_last;
634 bcp->watching = bau_tablesp->watching;
635 bcp->msg_statuses = bau_tablesp->msg_statuses;
636 bcp->descriptor_base = adp;
637 }
638}
639
640/*
641 * initialize the sending side's sending buffers
642 */
643static struct bau_desc * __init
644uv_activation_descriptor_init(int node, int pnode)
645{
646 int i;
647 unsigned long pa;
648 unsigned long m;
649 unsigned long n;
650 unsigned long mmr_image;
651 struct bau_desc *adp;
652 struct bau_desc *ad2;
653
654 adp = (struct bau_desc *)
655 kmalloc_node(16384, GFP_KERNEL, node);
656 BUG_ON(!adp);
657
658 pa = __pa((unsigned long)adp);
659 n = pa >> uv_nshift;
660 m = pa & uv_mmask;
661
662 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
663 if (mmr_image) {
664 uv_write_global_mmr64(pnode, (unsigned long)
665 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
666 (n << UV_DESC_BASE_PNODE_SHIFT | m));
667 }
668
669 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
670 memset(ad2, 0, sizeof(struct bau_desc));
671 ad2->header.sw_ack_flag = 1;
672 ad2->header.base_dest_nodeid =
673 uv_blade_to_pnode(uv_cpu_to_blade_id(0));
674 ad2->header.command = UV_NET_ENDPOINT_INTD;
675 ad2->header.int_both = 1;
676 /*
677 * all others need to be set to zero:
678 * fairness chaining multilevel count replied_to
679 */
680 }
681 return adp;
682}
683
684/*
685 * initialize the destination side's receiving buffers
686 */
687static struct bau_payload_queue_entry * __init
688uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
689{
690 struct bau_payload_queue_entry *pqp;
691 char *cp;
692
693 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
694 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
695 GFP_KERNEL, node);
696 BUG_ON(!pqp);
697
698 cp = (char *)pqp + 31;
699 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
700 bau_tablesp->va_queue_first = pqp;
701 uv_write_global_mmr64(pnode,
702 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
703 ((unsigned long)pnode <<
704 UV_PAYLOADQ_PNODE_SHIFT) |
705 uv_physnodeaddr(pqp));
706 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
707 uv_physnodeaddr(pqp));
708 bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
709 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
710 (unsigned long)
711 uv_physnodeaddr(bau_tablesp->va_queue_last));
712 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
713
714 return pqp;
715}
716
717/*
718 * Initialization of each UV blade's structures
719 */
720static int __init uv_init_blade(int blade, int node, int cur_cpu)
721{
722 int pnode;
723 unsigned long pa;
724 unsigned long apicid;
725 struct bau_desc *adp;
726 struct bau_payload_queue_entry *pqp;
727 struct bau_control *bau_tablesp;
728
729 bau_tablesp = uv_table_bases_init(blade, node);
730 pnode = uv_blade_to_pnode(blade);
731 adp = uv_activation_descriptor_init(node, pnode);
732 pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
733 uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
734 /*
735 * the below initialization can't be in firmware because the
736 * messaging IRQ will be determined by the OS
737 */
738 apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
739 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
740 if ((pa & 0xff) != UV_BAU_MESSAGE) {
741 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
742 ((apicid << 32) | UV_BAU_MESSAGE));
743 }
744 return 0;
745}
746
747/*
748 * Initialization of BAU-related structures
749 */
750static int __init uv_bau_init(void)
751{
752 int blade;
753 int node;
754 int nblades;
755 int last_blade;
756 int cur_cpu = 0;
757
758 if (!is_uv_system())
759 return 0;
760
761 uv_bau_retry_limit = 1;
762 uv_nshift = uv_hub_info->n_val;
763 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
764 nblades = 0;
765 last_blade = -1;
766 for_each_online_node(node) {
767 blade = uv_node_to_blade_id(node);
768 if (blade == last_blade)
769 continue;
770 last_blade = blade;
771 nblades++;
772 }
773 uv_bau_table_bases = (struct bau_control **)
774 kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
775 BUG_ON(!uv_bau_table_bases);
776
777 last_blade = -1;
778 for_each_online_node(node) {
779 blade = uv_node_to_blade_id(node);
780 if (blade == last_blade)
781 continue;
782 last_blade = blade;
783 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts();
788
789 return 0;
790}
791__initcall(uv_bau_init);
792__initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4 4
5/* ready for x86_64, no harm for x86, since it will overwrite after alloc */ 5/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 7
8/* 8/*
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 08d752de4eee..03df8e45e5a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
3 * 4 *
4 * Pentium III FXSR, SSE support 5 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 6 * Gareth Hughes <gareth@valinux.com>, May 2000
@@ -57,11 +58,10 @@
57#include <asm/nmi.h> 58#include <asm/nmi.h>
58#include <asm/smp.h> 59#include <asm/smp.h>
59#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
60 62
61#include "mach_traps.h" 63#include "mach_traps.h"
62 64
63int panic_on_unrecovered_nmi;
64
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 65DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 66EXPORT_SYMBOL_GPL(used_vectors);
67 67
@@ -78,39 +78,22 @@ char ignore_fpu_irq;
78gate_desc idt_table[256] 78gate_desc idt_table[256]
79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80 80
81asmlinkage void divide_error(void); 81int panic_on_unrecovered_nmi;
82asmlinkage void debug(void);
83asmlinkage void nmi(void);
84asmlinkage void int3(void);
85asmlinkage void overflow(void);
86asmlinkage void bounds(void);
87asmlinkage void invalid_op(void);
88asmlinkage void device_not_available(void);
89asmlinkage void coprocessor_segment_overrun(void);
90asmlinkage void invalid_TSS(void);
91asmlinkage void segment_not_present(void);
92asmlinkage void stack_segment(void);
93asmlinkage void general_protection(void);
94asmlinkage void page_fault(void);
95asmlinkage void coprocessor_error(void);
96asmlinkage void simd_coprocessor_error(void);
97asmlinkage void alignment_check(void);
98asmlinkage void spurious_interrupt_bug(void);
99asmlinkage void machine_check(void);
100
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
84static int ignore_nmis;
85static int die_counter;
103 86
104void printk_address(unsigned long address, int reliable) 87void printk_address(unsigned long address, int reliable)
105{ 88{
106#ifdef CONFIG_KALLSYMS 89#ifdef CONFIG_KALLSYMS
107 char namebuf[KSYM_NAME_LEN];
108 unsigned long offset = 0; 90 unsigned long offset = 0;
109 unsigned long symsize; 91 unsigned long symsize;
110 const char *symname; 92 const char *symname;
111 char reliab[4] = "";
112 char *delim = ":";
113 char *modname; 93 char *modname;
94 char *delim = ":";
95 char namebuf[KSYM_NAME_LEN];
96 char reliab[4] = "";
114 97
115 symname = kallsyms_lookup(address, &symsize, &offset, 98 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf); 99 &modname, namebuf);
@@ -130,22 +113,23 @@ void printk_address(unsigned long address, int reliable)
130#endif 113#endif
131} 114}
132 115
133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) 116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size)
134{ 118{
135 return p > (void *)tinfo && 119 void *t = tinfo;
136 p <= (void *)tinfo + THREAD_SIZE - size; 120 return p > t && p <= t + THREAD_SIZE - size;
137} 121}
138 122
139/* The form of the top of the frame on the stack */ 123/* The form of the top of the frame on the stack */
140struct stack_frame { 124struct stack_frame {
141 struct stack_frame *next_frame; 125 struct stack_frame *next_frame;
142 unsigned long return_address; 126 unsigned long return_address;
143}; 127};
144 128
145static inline unsigned long 129static inline unsigned long
146print_context_stack(struct thread_info *tinfo, 130print_context_stack(struct thread_info *tinfo,
147 unsigned long *stack, unsigned long bp, 131 unsigned long *stack, unsigned long bp,
148 const struct stacktrace_ops *ops, void *data) 132 const struct stacktrace_ops *ops, void *data)
149{ 133{
150 struct stack_frame *frame = (struct stack_frame *)bp; 134 struct stack_frame *frame = (struct stack_frame *)bp;
151 135
@@ -167,8 +151,6 @@ print_context_stack(struct thread_info *tinfo,
167 return bp; 151 return bp;
168} 152}
169 153
170#define MSG(msg) ops->warning(data, msg)
171
172void dump_trace(struct task_struct *task, struct pt_regs *regs, 154void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *stack, unsigned long bp, 155 unsigned long *stack, unsigned long bp,
174 const struct stacktrace_ops *ops, void *data) 156 const struct stacktrace_ops *ops, void *data)
@@ -178,7 +160,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
178 160
179 if (!stack) { 161 if (!stack) {
180 unsigned long dummy; 162 unsigned long dummy;
181
182 stack = &dummy; 163 stack = &dummy;
183 if (task != current) 164 if (task != current)
184 stack = (unsigned long *)task->thread.sp; 165 stack = (unsigned long *)task->thread.sp;
@@ -196,7 +177,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
196 } 177 }
197#endif 178#endif
198 179
199 while (1) { 180 for (;;) {
200 struct thread_info *context; 181 struct thread_info *context;
201 182
202 context = (struct thread_info *) 183 context = (struct thread_info *)
@@ -248,15 +229,15 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
248} 229}
249 230
250static const struct stacktrace_ops print_trace_ops = { 231static const struct stacktrace_ops print_trace_ops = {
251 .warning = print_trace_warning, 232 .warning = print_trace_warning,
252 .warning_symbol = print_trace_warning_symbol, 233 .warning_symbol = print_trace_warning_symbol,
253 .stack = print_trace_stack, 234 .stack = print_trace_stack,
254 .address = print_trace_address, 235 .address = print_trace_address,
255}; 236};
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -351,15 +332,14 @@ void show_registers(struct pt_regs *regs)
351 printk(KERN_EMERG "Code: "); 332 printk(KERN_EMERG "Code: ");
352 333
353 ip = (u8 *)regs->ip - code_prologue; 334 ip = (u8 *)regs->ip - code_prologue;
354 if (ip < (u8 *)PAGE_OFFSET || 335 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
355 probe_kernel_address(ip, c)) {
356 /* try starting at EIP */ 336 /* try starting at EIP */
357 ip = (u8 *)regs->ip; 337 ip = (u8 *)regs->ip;
358 code_len = code_len - code_prologue + 1; 338 code_len = code_len - code_prologue + 1;
359 } 339 }
360 for (i = 0; i < code_len; i++, ip++) { 340 for (i = 0; i < code_len; i++, ip++) {
361 if (ip < (u8 *)PAGE_OFFSET || 341 if (ip < (u8 *)PAGE_OFFSET ||
362 probe_kernel_address(ip, c)) { 342 probe_kernel_address(ip, c)) {
363 printk(" Bad EIP value."); 343 printk(" Bad EIP value.");
364 break; 344 break;
365 } 345 }
@@ -384,7 +364,53 @@ int is_valid_bugaddr(unsigned long ip)
384 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
385} 365}
386 366
387static int die_counter; 367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
388 414
389int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
390{ 416{
@@ -402,26 +428,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
402 printk("DEBUG_PAGEALLOC"); 428 printk("DEBUG_PAGEALLOC");
403#endif 429#endif
404 printk("\n"); 430 printk("\n");
405
406 if (notify_die(DIE_OOPS, str, regs, err, 431 if (notify_die(DIE_OOPS, str, regs, err,
407 current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { 432 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
408 433 return 1;
409 show_registers(regs);
410 /* Executive summary in case the oops scrolled away */
411 sp = (unsigned long) (&regs->sp);
412 savesegment(ss, ss);
413 if (user_mode(regs)) {
414 sp = regs->sp;
415 ss = regs->ss & 0xffff;
416 }
417 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
418 print_symbol("%s", regs->ip);
419 printk(" SS:ESP %04x:%08lx\n", ss, sp);
420 434
421 return 0; 435 show_registers(regs);
436 /* Executive summary in case the oops scrolled away */
437 sp = (unsigned long) (&regs->sp);
438 savesegment(ss, ss);
439 if (user_mode(regs)) {
440 sp = regs->sp;
441 ss = regs->ss & 0xffff;
422 } 442 }
423 443 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
424 return 1; 444 print_symbol("%s", regs->ip);
445 printk(" SS:ESP %04x:%08lx\n", ss, sp);
446 return 0;
425} 447}
426 448
427/* 449/*
@@ -430,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
430 */ 452 */
431void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
432{ 454{
433 static struct { 455 unsigned long flags = oops_begin();
434 raw_spinlock_t lock;
435 u32 lock_owner;
436 int lock_owner_depth;
437 } die = {
438 .lock = __RAW_SPIN_LOCK_UNLOCKED,
439 .lock_owner = -1,
440 .lock_owner_depth = 0
441 };
442 unsigned long flags;
443
444 oops_enter();
445
446 if (die.lock_owner != raw_smp_processor_id()) {
447 console_verbose();
448 raw_local_irq_save(flags);
449 __raw_spin_lock(&die.lock);
450 die.lock_owner = smp_processor_id();
451 die.lock_owner_depth = 0;
452 bust_spinlocks(1);
453 } else {
454 raw_local_irq_save(flags);
455 }
456 456
457 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
458 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
459 459
460 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -463,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
464 } 464 }
465 465
466 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
467 die.lock_owner = -1;
468 add_taint(TAINT_DIE);
469 __raw_spin_unlock(&die.lock);
470 raw_local_irq_restore(flags);
471
472 if (!regs)
473 return;
474
475 if (kexec_should_crash(current))
476 crash_kexec(regs);
477
478 if (in_interrupt())
479 panic("Fatal exception in interrupt");
480
481 if (panic_on_oops)
482 panic("Fatal exception");
483
484 oops_exit();
485 do_exit(SIGSEGV);
486} 467}
487 468
488static inline void 469static inline void
@@ -546,7 +527,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
546{ \ 527{ \
547 trace_hardirqs_fixup(); \ 528 trace_hardirqs_fixup(); \
548 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
549 == NOTIFY_STOP) \ 530 == NOTIFY_STOP) \
550 return; \ 531 return; \
551 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
552} 533}
@@ -562,7 +543,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
562 info.si_code = sicode; \ 543 info.si_code = sicode; \
563 info.si_addr = (void __user *)siaddr; \ 544 info.si_addr = (void __user *)siaddr; \
564 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
565 == NOTIFY_STOP) \ 546 == NOTIFY_STOP) \
566 return; \ 547 return; \
567 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
568} 549}
@@ -571,7 +552,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
571void do_##name(struct pt_regs *regs, long error_code) \ 552void do_##name(struct pt_regs *regs, long error_code) \
572{ \ 553{ \
573 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
574 == NOTIFY_STOP) \ 555 == NOTIFY_STOP) \
575 return; \ 556 return; \
576 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
577} 558}
@@ -586,27 +567,29 @@ void do_##name(struct pt_regs *regs, long error_code) \
586 info.si_addr = (void __user *)siaddr; \ 567 info.si_addr = (void __user *)siaddr; \
587 trace_hardirqs_fixup(); \ 568 trace_hardirqs_fixup(); \
588 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 569 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
589 == NOTIFY_STOP) \ 570 == NOTIFY_STOP) \
590 return; \ 571 return; \
591 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 572 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
592} 573}
593 574
594DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 575DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
595#ifndef CONFIG_KPROBES 576#ifndef CONFIG_KPROBES
596DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 577DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
597#endif 578#endif
598DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) 579DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
599DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) 580DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
600DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) 581DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
601DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 582DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
602DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 583DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
603DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 584DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
604DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 585DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
605DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 586DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
606DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 587DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
607 588
608void __kprobes do_general_protection(struct pt_regs *regs, long error_code) 589void __kprobes
590do_general_protection(struct pt_regs *regs, long error_code)
609{ 591{
592 struct task_struct *tsk;
610 struct thread_struct *thread; 593 struct thread_struct *thread;
611 struct tss_struct *tss; 594 struct tss_struct *tss;
612 int cpu; 595 int cpu;
@@ -647,23 +630,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
647 if (regs->flags & X86_VM_MASK) 630 if (regs->flags & X86_VM_MASK)
648 goto gp_in_vm86; 631 goto gp_in_vm86;
649 632
633 tsk = current;
650 if (!user_mode(regs)) 634 if (!user_mode(regs))
651 goto gp_in_kernel; 635 goto gp_in_kernel;
652 636
653 current->thread.error_code = error_code; 637 tsk->thread.error_code = error_code;
654 current->thread.trap_no = 13; 638 tsk->thread.trap_no = 13;
655 639
656 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 640 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
657 printk_ratelimit()) { 641 printk_ratelimit()) {
658 printk(KERN_INFO 642 printk(KERN_INFO
659 "%s[%d] general protection ip:%lx sp:%lx error:%lx", 643 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
660 current->comm, task_pid_nr(current), 644 tsk->comm, task_pid_nr(tsk),
661 regs->ip, regs->sp, error_code); 645 regs->ip, regs->sp, error_code);
662 print_vma_addr(" in ", regs->ip); 646 print_vma_addr(" in ", regs->ip);
663 printk("\n"); 647 printk("\n");
664 } 648 }
665 649
666 force_sig(SIGSEGV, current); 650 force_sig(SIGSEGV, tsk);
667 return; 651 return;
668 652
669gp_in_vm86: 653gp_in_vm86:
@@ -672,14 +656,15 @@ gp_in_vm86:
672 return; 656 return;
673 657
674gp_in_kernel: 658gp_in_kernel:
675 if (!fixup_exception(regs)) { 659 if (fixup_exception(regs))
676 current->thread.error_code = error_code; 660 return;
677 current->thread.trap_no = 13; 661
678 if (notify_die(DIE_GPF, "general protection fault", regs, 662 tsk->thread.error_code = error_code;
663 tsk->thread.trap_no = 13;
664 if (notify_die(DIE_GPF, "general protection fault", regs,
679 error_code, 13, SIGSEGV) == NOTIFY_STOP) 665 error_code, 13, SIGSEGV) == NOTIFY_STOP)
680 return; 666 return;
681 die("general protection fault", regs, error_code); 667 die("general protection fault", regs, error_code);
682 }
683} 668}
684 669
685static notrace __kprobes void 670static notrace __kprobes void
@@ -756,9 +741,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
756 741
757static DEFINE_SPINLOCK(nmi_print_lock); 742static DEFINE_SPINLOCK(nmi_print_lock);
758 743
759void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) 744void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
760{ 745{
761 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) 746 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
762 return; 747 return;
763 748
764 spin_lock(&nmi_print_lock); 749 spin_lock(&nmi_print_lock);
@@ -767,10 +752,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
767 * to get a message out: 752 * to get a message out:
768 */ 753 */
769 bust_spinlocks(1); 754 bust_spinlocks(1);
770 printk(KERN_EMERG "%s", msg); 755 printk(KERN_EMERG "%s", str);
771 printk(" on CPU%d, ip %08lx, registers:\n", 756 printk(" on CPU%d, ip %08lx, registers:\n",
772 smp_processor_id(), regs->ip); 757 smp_processor_id(), regs->ip);
773 show_registers(regs); 758 show_registers(regs);
759 if (do_panic)
760 panic("Non maskable interrupt");
774 console_silent(); 761 console_silent();
775 spin_unlock(&nmi_print_lock); 762 spin_unlock(&nmi_print_lock);
776 bust_spinlocks(0); 763 bust_spinlocks(0);
@@ -790,14 +777,17 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
790static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 777static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
791{ 778{
792 unsigned char reason = 0; 779 unsigned char reason = 0;
780 int cpu;
793 781
794 /* Only the BSP gets external NMIs from the system: */ 782 cpu = smp_processor_id();
795 if (!smp_processor_id()) 783
784 /* Only the BSP gets external NMIs from the system. */
785 if (!cpu)
796 reason = get_nmi_reason(); 786 reason = get_nmi_reason();
797 787
798 if (!(reason & 0xc0)) { 788 if (!(reason & 0xc0)) {
799 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 789 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
800 == NOTIFY_STOP) 790 == NOTIFY_STOP)
801 return; 791 return;
802#ifdef CONFIG_X86_LOCAL_APIC 792#ifdef CONFIG_X86_LOCAL_APIC
803 /* 793 /*
@@ -806,7 +796,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
806 */ 796 */
807 if (nmi_watchdog_tick(regs, reason)) 797 if (nmi_watchdog_tick(regs, reason))
808 return; 798 return;
809 if (!do_nmi_callback(regs, smp_processor_id())) 799 if (!do_nmi_callback(regs, cpu))
810 unknown_nmi_error(reason, regs); 800 unknown_nmi_error(reason, regs);
811#else 801#else
812 unknown_nmi_error(reason, regs); 802 unknown_nmi_error(reason, regs);
@@ -816,6 +806,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
816 } 806 }
817 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 807 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
818 return; 808 return;
809
810 /* AK: following checks seem to be broken on modern chipsets. FIXME */
819 if (reason & 0x80) 811 if (reason & 0x80)
820 mem_parity_error(reason, regs); 812 mem_parity_error(reason, regs);
821 if (reason & 0x40) 813 if (reason & 0x40)
@@ -827,8 +819,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
827 reassert_nmi(); 819 reassert_nmi();
828} 820}
829 821
830static int ignore_nmis;
831
832notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 822notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
833{ 823{
834 int cpu; 824 int cpu;
@@ -913,7 +903,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
913 tsk->thread.debugctlmsr = 0; 903 tsk->thread.debugctlmsr = 0;
914 904
915 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 905 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
916 SIGTRAP) == NOTIFY_STOP) 906 SIGTRAP) == NOTIFY_STOP)
917 return; 907 return;
918 /* It's safe to allow irq's after DR6 has been saved */ 908 /* It's safe to allow irq's after DR6 has been saved */
919 if (regs->flags & X86_EFLAGS_IF) 909 if (regs->flags & X86_EFLAGS_IF)
@@ -974,9 +964,8 @@ clear_TF_reenable:
974void math_error(void __user *ip) 964void math_error(void __user *ip)
975{ 965{
976 struct task_struct *task; 966 struct task_struct *task;
977 unsigned short cwd;
978 unsigned short swd;
979 siginfo_t info; 967 siginfo_t info;
968 unsigned short cwd, swd;
980 969
981 /* 970 /*
982 * Save the info for the exception handler and clear the error. 971 * Save the info for the exception handler and clear the error.
@@ -995,7 +984,7 @@ void math_error(void __user *ip)
995 * C1 reg you need in case of a stack fault, 0x040 is the stack 984 * C1 reg you need in case of a stack fault, 0x040 is the stack
996 * fault bit. We should only be taking one exception at a time, 985 * fault bit. We should only be taking one exception at a time,
997 * so if this combination doesn't produce any single exception, 986 * so if this combination doesn't produce any single exception,
998 * then we have a bad program that isn't syncronizing its FPU usage 987 * then we have a bad program that isn't synchronizing its FPU usage
999 * and it will suffer the consequences since we won't be able to 988 * and it will suffer the consequences since we won't be able to
1000 * fully reproduce the context of the exception 989 * fully reproduce the context of the exception
1001 */ 990 */
@@ -1004,7 +993,7 @@ void math_error(void __user *ip)
1004 switch (swd & ~cwd & 0x3f) { 993 switch (swd & ~cwd & 0x3f) {
1005 case 0x000: /* No unmasked exception */ 994 case 0x000: /* No unmasked exception */
1006 return; 995 return;
1007 default: /* Multiple exceptions */ 996 default: /* Multiple exceptions */
1008 break; 997 break;
1009 case 0x001: /* Invalid Op */ 998 case 0x001: /* Invalid Op */
1010 /* 999 /*
@@ -1040,8 +1029,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code)
1040static void simd_math_error(void __user *ip) 1029static void simd_math_error(void __user *ip)
1041{ 1030{
1042 struct task_struct *task; 1031 struct task_struct *task;
1043 unsigned short mxcsr;
1044 siginfo_t info; 1032 siginfo_t info;
1033 unsigned short mxcsr;
1045 1034
1046 /* 1035 /*
1047 * Save the info for the exception handler and clear the error. 1036 * Save the info for the exception handler and clear the error.
@@ -1117,7 +1106,7 @@ void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1117 1106
1118unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 1107unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1119{ 1108{
1120 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; 1109 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
1121 unsigned long base = (kesp - uesp) & -THREAD_SIZE; 1110 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1122 unsigned long new_kesp = kesp - base; 1111 unsigned long new_kesp = kesp - base;
1123 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; 1112 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
@@ -1196,19 +1185,16 @@ void __init trap_init(void)
1196 early_iounmap(p, 4); 1185 early_iounmap(p, 4);
1197#endif 1186#endif
1198 1187
1199#ifdef CONFIG_X86_LOCAL_APIC 1188 set_trap_gate(0, &divide_error);
1200 init_apic_mappings(); 1189 set_intr_gate(1, &debug);
1201#endif 1190 set_intr_gate(2, &nmi);
1202 set_trap_gate(0, &divide_error); 1191 set_system_intr_gate(3, &int3); /* int3 can be called from all */
1203 set_intr_gate(1, &debug); 1192 set_system_gate(4, &overflow); /* int4 can be called from all */
1204 set_intr_gate(2, &nmi); 1193 set_trap_gate(5, &bounds);
1205 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ 1194 set_trap_gate(6, &invalid_op);
1206 set_system_gate(4, &overflow); 1195 set_trap_gate(7, &device_not_available);
1207 set_trap_gate(5, &bounds); 1196 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1208 set_trap_gate(6, &invalid_op); 1197 set_trap_gate(9, &coprocessor_segment_overrun);
1209 set_trap_gate(7, &device_not_available);
1210 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1211 set_trap_gate(9, &coprocessor_segment_overrun);
1212 set_trap_gate(10, &invalid_TSS); 1198 set_trap_gate(10, &invalid_TSS);
1213 set_trap_gate(11, &segment_not_present); 1199 set_trap_gate(11, &segment_not_present);
1214 set_trap_gate(12, &stack_segment); 1200 set_trap_gate(12, &stack_segment);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..3f18d73f420c 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -10,73 +10,56 @@
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'. 11 * state in 'entry.S'.
12 */ 12 */
13#include <linux/sched.h> 13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
14#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
15#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
16#include <linux/errno.h> 27#include <linux/errno.h>
17#include <linux/ptrace.h> 28#include <linux/kexec.h>
29#include <linux/sched.h>
18#include <linux/timer.h> 30#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h> 31#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/kallsyms.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/nmi.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/kdebug.h> 33#include <linux/nmi.h>
34#include <linux/utsname.h> 34#include <linux/mm.h>
35
36#include <mach_traps.h>
37 35
38#if defined(CONFIG_EDAC) 36#if defined(CONFIG_EDAC)
39#include <linux/edac.h> 37#include <linux/edac.h>
40#endif 38#endif
41 39
42#include <asm/system.h> 40#include <asm/stacktrace.h>
43#include <asm/io.h> 41#include <asm/processor.h>
44#include <asm/atomic.h>
45#include <asm/debugreg.h> 42#include <asm/debugreg.h>
43#include <asm/atomic.h>
44#include <asm/system.h>
45#include <asm/unwind.h>
46#include <asm/desc.h> 46#include <asm/desc.h>
47#include <asm/i387.h> 47#include <asm/i387.h>
48#include <asm/processor.h> 48#include <asm/nmi.h>
49#include <asm/unwind.h>
50#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/pda.h>
53#include <asm/proto.h> 52#include <asm/proto.h>
54#include <asm/nmi.h> 53#include <asm/pda.h>
55#include <asm/stacktrace.h> 54#include <asm/traps.h>
56 55
57asmlinkage void divide_error(void); 56#include <mach_traps.h>
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void reserved(void);
75asmlinkage void alignment_check(void);
76asmlinkage void machine_check(void);
77asmlinkage void spurious_interrupt_bug(void);
78 57
58int panic_on_unrecovered_nmi;
59int kstack_depth_to_print = 12;
79static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
61static int ignore_nmis;
62static int die_counter;
80 63
81static inline void conditional_sti(struct pt_regs *regs) 64static inline void conditional_sti(struct pt_regs *regs)
82{ 65{
@@ -100,34 +83,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
100 dec_preempt_count(); 83 dec_preempt_count();
101} 84}
102 85
103int kstack_depth_to_print = 12;
104
105void printk_address(unsigned long address, int reliable) 86void printk_address(unsigned long address, int reliable)
106{ 87{
107#ifdef CONFIG_KALLSYMS 88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
108 unsigned long offset = 0, symsize;
109 const char *symname;
110 char *modname;
111 char *delim = ":";
112 char namebuf[KSYM_NAME_LEN];
113 char reliab[4] = "";
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%016lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%016lx>]\n", address);
130#endif
131} 89}
132 90
133static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -204,8 +162,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
204 return NULL; 162 return NULL;
205} 163}
206 164
207#define MSG(txt) ops->warning(data, txt)
208
209/* 165/*
210 * x86-64 can have up to three kernel stacks: 166 * x86-64 can have up to three kernel stacks:
211 * process stack 167 * process stack
@@ -232,11 +188,11 @@ struct stack_frame {
232 unsigned long return_address; 188 unsigned long return_address;
233}; 189};
234 190
235 191static inline unsigned long
236static inline unsigned long print_context_stack(struct thread_info *tinfo, 192print_context_stack(struct thread_info *tinfo,
237 unsigned long *stack, unsigned long bp, 193 unsigned long *stack, unsigned long bp,
238 const struct stacktrace_ops *ops, void *data, 194 const struct stacktrace_ops *ops, void *data,
239 unsigned long *end) 195 unsigned long *end)
240{ 196{
241 struct stack_frame *frame = (struct stack_frame *)bp; 197 struct stack_frame *frame = (struct stack_frame *)bp;
242 198
@@ -258,7 +214,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
258 return bp; 214 return bp;
259} 215}
260 216
261void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 217void dump_trace(struct task_struct *task, struct pt_regs *regs,
262 unsigned long *stack, unsigned long bp, 218 unsigned long *stack, unsigned long bp,
263 const struct stacktrace_ops *ops, void *data) 219 const struct stacktrace_ops *ops, void *data)
264{ 220{
@@ -267,36 +223,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
267 unsigned used = 0; 223 unsigned used = 0;
268 struct thread_info *tinfo; 224 struct thread_info *tinfo;
269 225
270 if (!tsk) 226 if (!task)
271 tsk = current; 227 task = current;
272 tinfo = task_thread_info(tsk);
273 228
274 if (!stack) { 229 if (!stack) {
275 unsigned long dummy; 230 unsigned long dummy;
276 stack = &dummy; 231 stack = &dummy;
277 if (tsk && tsk != current) 232 if (task && task != current)
278 stack = (unsigned long *)tsk->thread.sp; 233 stack = (unsigned long *)task->thread.sp;
279 } 234 }
280 235
281#ifdef CONFIG_FRAME_POINTER 236#ifdef CONFIG_FRAME_POINTER
282 if (!bp) { 237 if (!bp) {
283 if (tsk == current) { 238 if (task == current) {
284 /* Grab bp right from our regs */ 239 /* Grab bp right from our regs */
285 asm("movq %%rbp, %0" : "=r" (bp):); 240 asm("movq %%rbp, %0" : "=r" (bp) :);
286 } else { 241 } else {
287 /* bp is the last reg pushed by switch_to */ 242 /* bp is the last reg pushed by switch_to */
288 bp = *(unsigned long *) tsk->thread.sp; 243 bp = *(unsigned long *) task->thread.sp;
289 } 244 }
290 } 245 }
291#endif 246#endif
292 247
293
294
295 /* 248 /*
296 * Print function call entries in all stacks, starting at the 249 * Print function call entries in all stacks, starting at the
297 * current stack address. If the stacks consist of nested 250 * current stack address. If the stacks consist of nested
298 * exceptions 251 * exceptions
299 */ 252 */
253 tinfo = task_thread_info(task);
300 for (;;) { 254 for (;;) {
301 char *id; 255 char *id;
302 unsigned long *estack_end; 256 unsigned long *estack_end;
@@ -381,18 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
381 .address = print_trace_address, 335 .address = print_trace_address,
382}; 336};
383 337
384void 338static void
385show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
386 unsigned long bp) 340 unsigned long *stack, unsigned long bp, char *log_lvl)
387{ 341{
388 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
389 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
390 printk("\n"); 344 printk("\n");
391} 345}
392 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
393static void 353static void
394_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
395 unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
396{ 356{
397 unsigned long *stack; 357 unsigned long *stack;
398 int i; 358 int i;
@@ -404,14 +364,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
404 // back trace for this cpu. 364 // back trace for this cpu.
405 365
406 if (sp == NULL) { 366 if (sp == NULL) {
407 if (tsk) 367 if (task)
408 sp = (unsigned long *)tsk->thread.sp; 368 sp = (unsigned long *)task->thread.sp;
409 else 369 else
410 sp = (unsigned long *)&sp; 370 sp = (unsigned long *)&sp;
411 } 371 }
412 372
413 stack = sp; 373 stack = sp;
414 for(i=0; i < kstack_depth_to_print; i++) { 374 for (i = 0; i < kstack_depth_to_print; i++) {
415 if (stack >= irqstack && stack <= irqstack_end) { 375 if (stack >= irqstack && stack <= irqstack_end) {
416 if (stack == irqstack_end) { 376 if (stack == irqstack_end) {
417 stack = (unsigned long *) (irqstack_end[-1]); 377 stack = (unsigned long *) (irqstack_end[-1]);
@@ -426,12 +386,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
426 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
427 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
428 } 388 }
429 show_trace(tsk, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
430} 390}
431 391
432void show_stack(struct task_struct *tsk, unsigned long * sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
433{ 393{
434 _show_stack(tsk, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
435} 395}
436 396
437/* 397/*
@@ -439,8 +399,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp)
439 */ 399 */
440void dump_stack(void) 400void dump_stack(void)
441{ 401{
442 unsigned long dummy;
443 unsigned long bp = 0; 402 unsigned long bp = 0;
403 unsigned long stack;
444 404
445#ifdef CONFIG_FRAME_POINTER 405#ifdef CONFIG_FRAME_POINTER
446 if (!bp) 406 if (!bp)
@@ -452,7 +412,7 @@ void dump_stack(void)
452 init_utsname()->release, 412 init_utsname()->release,
453 (int)strcspn(init_utsname()->version, " "), 413 (int)strcspn(init_utsname()->version, " "),
454 init_utsname()->version); 414 init_utsname()->version);
455 show_trace(NULL, NULL, &dummy, bp); 415 show_trace(NULL, NULL, &stack, bp);
456} 416}
457 417
458EXPORT_SYMBOL(dump_stack); 418EXPORT_SYMBOL(dump_stack);
@@ -463,12 +423,8 @@ void show_registers(struct pt_regs *regs)
463 unsigned long sp; 423 unsigned long sp;
464 const int cpu = smp_processor_id(); 424 const int cpu = smp_processor_id();
465 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 425 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
466 u8 *ip;
467 unsigned int code_prologue = code_bytes * 43 / 64;
468 unsigned int code_len = code_bytes;
469 426
470 sp = regs->sp; 427 sp = regs->sp;
471 ip = (u8 *) regs->ip - code_prologue;
472 printk("CPU %d ", cpu); 428 printk("CPU %d ", cpu);
473 __show_regs(regs); 429 __show_regs(regs);
474 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 430 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -479,15 +435,22 @@ void show_registers(struct pt_regs *regs)
479 * time of the fault.. 435 * time of the fault..
480 */ 436 */
481 if (!user_mode(regs)) { 437 if (!user_mode(regs)) {
438 unsigned int code_prologue = code_bytes * 43 / 64;
439 unsigned int code_len = code_bytes;
482 unsigned char c; 440 unsigned char c;
441 u8 *ip;
442
483 printk("Stack: "); 443 printk("Stack: ");
484 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
485 printk("\n"); 446 printk("\n");
486 447
487 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
449
450 ip = (u8 *)regs->ip - code_prologue;
488 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 451 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
489 /* try starting at RIP */ 452 /* try starting at RIP */
490 ip = (u8 *) regs->ip; 453 ip = (u8 *)regs->ip;
491 code_len = code_len - code_prologue + 1; 454 code_len = code_len - code_prologue + 1;
492 } 455 }
493 for (i = 0; i < code_len; i++, ip++) { 456 for (i = 0; i < code_len; i++, ip++) {
@@ -503,7 +466,7 @@ void show_registers(struct pt_regs *regs)
503 } 466 }
504 } 467 }
505 printk("\n"); 468 printk("\n");
506} 469}
507 470
508int is_valid_bugaddr(unsigned long ip) 471int is_valid_bugaddr(unsigned long ip)
509{ 472{
@@ -543,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
543} 506}
544 507
545void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
546{ 509{
547 die_owner = -1; 510 die_owner = -1;
548 bust_spinlocks(0); 511 bust_spinlocks(0);
549 die_nest_count--; 512 die_nest_count--;
@@ -561,10 +524,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
561 do_exit(signr); 524 do_exit(signr);
562} 525}
563 526
564int __kprobes __die(const char * str, struct pt_regs * regs, long err) 527int __kprobes __die(const char *str, struct pt_regs *regs, long err)
565{ 528{
566 static int die_counter; 529 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
567 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
568#ifdef CONFIG_PREEMPT 530#ifdef CONFIG_PREEMPT
569 printk("PREEMPT "); 531 printk("PREEMPT ");
570#endif 532#endif
@@ -575,8 +537,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
575 printk("DEBUG_PAGEALLOC"); 537 printk("DEBUG_PAGEALLOC");
576#endif 538#endif
577 printk("\n"); 539 printk("\n");
578 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 540 if (notify_die(DIE_OOPS, str, regs, err,
541 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
579 return 1; 542 return 1;
543
580 show_registers(regs); 544 show_registers(regs);
581 add_taint(TAINT_DIE); 545 add_taint(TAINT_DIE);
582 /* Executive summary in case the oops scrolled away */ 546 /* Executive summary in case the oops scrolled away */
@@ -588,7 +552,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
588 return 0; 552 return 0;
589} 553}
590 554
591void die(const char * str, struct pt_regs * regs, long err) 555void die(const char *str, struct pt_regs *regs, long err)
592{ 556{
593 unsigned long flags = oops_begin(); 557 unsigned long flags = oops_begin();
594 558
@@ -605,8 +569,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
605{ 569{
606 unsigned long flags; 570 unsigned long flags;
607 571
608 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == 572 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
609 NOTIFY_STOP)
610 return; 573 return;
611 574
612 flags = oops_begin(); 575 flags = oops_begin();
@@ -614,7 +577,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
614 * We are in trouble anyway, lets at least try 577 * We are in trouble anyway, lets at least try
615 * to get a message out. 578 * to get a message out.
616 */ 579 */
617 printk(str, smp_processor_id()); 580 printk(KERN_EMERG "%s", str);
581 printk(" on CPU%d, ip %08lx, registers:\n",
582 smp_processor_id(), regs->ip);
618 show_registers(regs); 583 show_registers(regs);
619 if (kexec_should_crash(current)) 584 if (kexec_should_crash(current))
620 crash_kexec(regs); 585 crash_kexec(regs);
@@ -626,44 +591,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
626 do_exit(SIGBUS); 591 do_exit(SIGBUS);
627} 592}
628 593
629static void __kprobes do_trap(int trapnr, int signr, char *str, 594static void __kprobes
630 struct pt_regs * regs, long error_code, 595do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
631 siginfo_t *info) 596 long error_code, siginfo_t *info)
632{ 597{
633 struct task_struct *tsk = current; 598 struct task_struct *tsk = current;
634 599
635 if (user_mode(regs)) { 600 if (!user_mode(regs))
636 /* 601 goto kernel_trap;
637 * We want error_code and trap_no set for userspace
638 * faults and kernelspace faults which result in
639 * die(), but not kernelspace faults which are fixed
640 * up. die() gives the process no chance to handle
641 * the signal and notice the kernel fault information,
642 * so that won't result in polluting the information
643 * about previously queued, but not yet delivered,
644 * faults. See also do_general_protection below.
645 */
646 tsk->thread.error_code = error_code;
647 tsk->thread.trap_no = trapnr;
648
649 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
650 printk_ratelimit()) {
651 printk(KERN_INFO
652 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
653 tsk->comm, tsk->pid, str,
654 regs->ip, regs->sp, error_code);
655 print_vma_addr(" in ", regs->ip);
656 printk("\n");
657 }
658 602
659 if (info) 603 /*
660 force_sig_info(signr, info, tsk); 604 * We want error_code and trap_no set for userspace faults and
661 else 605 * kernelspace faults which result in die(), but not
662 force_sig(signr, tsk); 606 * kernelspace faults which are fixed up. die() gives the
663 return; 607 * process no chance to handle the signal and notice the
608 * kernel fault information, so that won't result in polluting
609 * the information about previously queued, but not yet
610 * delivered, faults. See also do_general_protection below.
611 */
612 tsk->thread.error_code = error_code;
613 tsk->thread.trap_no = trapnr;
614
615 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
616 printk_ratelimit()) {
617 printk(KERN_INFO
618 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
619 tsk->comm, tsk->pid, str,
620 regs->ip, regs->sp, error_code);
621 print_vma_addr(" in ", regs->ip);
622 printk("\n");
664 } 623 }
665 624
625 if (info)
626 force_sig_info(signr, info, tsk);
627 else
628 force_sig(signr, tsk);
629 return;
666 630
631kernel_trap:
667 if (!fixup_exception(regs)) { 632 if (!fixup_exception(regs)) {
668 tsk->thread.error_code = error_code; 633 tsk->thread.error_code = error_code;
669 tsk->thread.trap_no = trapnr; 634 tsk->thread.trap_no = trapnr;
@@ -673,41 +638,39 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
673} 638}
674 639
675#define DO_ERROR(trapnr, signr, str, name) \ 640#define DO_ERROR(trapnr, signr, str, name) \
676asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 641asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
677{ \ 642{ \
678 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 643 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
679 == NOTIFY_STOP) \ 644 == NOTIFY_STOP) \
680 return; \ 645 return; \
681 conditional_sti(regs); \ 646 conditional_sti(regs); \
682 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 647 do_trap(trapnr, signr, str, regs, error_code, NULL); \
683} 648}
684 649
685#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 650#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
686asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 651asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
687{ \ 652{ \
688 siginfo_t info; \ 653 siginfo_t info; \
689 info.si_signo = signr; \ 654 info.si_signo = signr; \
690 info.si_errno = 0; \ 655 info.si_errno = 0; \
691 info.si_code = sicode; \ 656 info.si_code = sicode; \
692 info.si_addr = (void __user *)siaddr; \ 657 info.si_addr = (void __user *)siaddr; \
693 trace_hardirqs_fixup(); \ 658 trace_hardirqs_fixup(); \
694 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 659 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
695 == NOTIFY_STOP) \ 660 == NOTIFY_STOP) \
696 return; \ 661 return; \
697 conditional_sti(regs); \ 662 conditional_sti(regs); \
698 do_trap(trapnr, signr, str, regs, error_code, &info); \ 663 do_trap(trapnr, signr, str, regs, error_code, &info); \
699} 664}
700 665
701DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 666DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
702DO_ERROR( 4, SIGSEGV, "overflow", overflow) 667DO_ERROR(4, SIGSEGV, "overflow", overflow)
703DO_ERROR( 5, SIGSEGV, "bounds", bounds) 668DO_ERROR(5, SIGSEGV, "bounds", bounds)
704DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) 669DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
705DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 670DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
706DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
707DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 671DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
708DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 672DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
709DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 673DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
710DO_ERROR(18, SIGSEGV, "reserved", reserved)
711 674
712/* Runs on IST stack */ 675/* Runs on IST stack */
713asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) 676asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
@@ -737,31 +700,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
737 die(str, regs, error_code); 700 die(str, regs, error_code);
738} 701}
739 702
740asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, 703asmlinkage void __kprobes
741 long error_code) 704do_general_protection(struct pt_regs *regs, long error_code)
742{ 705{
743 struct task_struct *tsk = current; 706 struct task_struct *tsk;
744 707
745 conditional_sti(regs); 708 conditional_sti(regs);
746 709
747 if (user_mode(regs)) { 710 tsk = current;
748 tsk->thread.error_code = error_code; 711 if (!user_mode(regs))
749 tsk->thread.trap_no = 13; 712 goto gp_in_kernel;
750
751 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
752 printk_ratelimit()) {
753 printk(KERN_INFO
754 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
755 tsk->comm, tsk->pid,
756 regs->ip, regs->sp, error_code);
757 print_vma_addr(" in ", regs->ip);
758 printk("\n");
759 }
760 713
761 force_sig(SIGSEGV, tsk); 714 tsk->thread.error_code = error_code;
762 return; 715 tsk->thread.trap_no = 13;
763 } 716
717 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
718 printk_ratelimit()) {
719 printk(KERN_INFO
720 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
721 tsk->comm, tsk->pid,
722 regs->ip, regs->sp, error_code);
723 print_vma_addr(" in ", regs->ip);
724 printk("\n");
725 }
726
727 force_sig(SIGSEGV, tsk);
728 return;
764 729
730gp_in_kernel:
765 if (fixup_exception(regs)) 731 if (fixup_exception(regs))
766 return; 732 return;
767 733
@@ -774,14 +740,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
774} 740}
775 741
776static notrace __kprobes void 742static notrace __kprobes void
777mem_parity_error(unsigned char reason, struct pt_regs * regs) 743mem_parity_error(unsigned char reason, struct pt_regs *regs)
778{ 744{
779 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 745 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
780 reason); 746 reason);
781 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); 747 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
782 748
783#if defined(CONFIG_EDAC) 749#if defined(CONFIG_EDAC)
784 if(edac_handler_set()) { 750 if (edac_handler_set()) {
785 edac_atomic_assert_error(); 751 edac_atomic_assert_error();
786 return; 752 return;
787 } 753 }
@@ -798,7 +764,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
798} 764}
799 765
800static notrace __kprobes void 766static notrace __kprobes void
801io_check_error(unsigned char reason, struct pt_regs * regs) 767io_check_error(unsigned char reason, struct pt_regs *regs)
802{ 768{
803 printk("NMI: IOCK error (debug interrupt?)\n"); 769 printk("NMI: IOCK error (debug interrupt?)\n");
804 show_registers(regs); 770 show_registers(regs);
@@ -828,14 +794,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
828 794
829/* Runs on IST stack. This code must keep interrupts off all the time. 795/* Runs on IST stack. This code must keep interrupts off all the time.
830 Nested NMIs are prevented by the CPU. */ 796 Nested NMIs are prevented by the CPU. */
831asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) 797asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
832{ 798{
833 unsigned char reason = 0; 799 unsigned char reason = 0;
834 int cpu; 800 int cpu;
835 801
836 cpu = smp_processor_id(); 802 cpu = smp_processor_id();
837 803
838 /* Only the BSP gets external NMIs from the system. */ 804 /* Only the BSP gets external NMIs from the system. */
839 if (!cpu) 805 if (!cpu)
840 reason = get_nmi_reason(); 806 reason = get_nmi_reason();
841 807
@@ -847,32 +813,57 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
847 * Ok, so this is none of the documented NMI sources, 813 * Ok, so this is none of the documented NMI sources,
848 * so it must be the NMI watchdog. 814 * so it must be the NMI watchdog.
849 */ 815 */
850 if (nmi_watchdog_tick(regs,reason)) 816 if (nmi_watchdog_tick(regs, reason))
851 return; 817 return;
852 if (!do_nmi_callback(regs,cpu)) 818 if (!do_nmi_callback(regs, cpu))
853 unknown_nmi_error(reason, regs); 819 unknown_nmi_error(reason, regs);
854 820
855 return; 821 return;
856 } 822 }
857 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 823 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
858 return; 824 return;
859 825
860 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 826 /* AK: following checks seem to be broken on modern chipsets. FIXME */
861
862 if (reason & 0x80) 827 if (reason & 0x80)
863 mem_parity_error(reason, regs); 828 mem_parity_error(reason, regs);
864 if (reason & 0x40) 829 if (reason & 0x40)
865 io_check_error(reason, regs); 830 io_check_error(reason, regs);
866} 831}
867 832
833asmlinkage notrace __kprobes void
834do_nmi(struct pt_regs *regs, long error_code)
835{
836 nmi_enter();
837
838 add_pda(__nmi_count, 1);
839
840 if (!ignore_nmis)
841 default_do_nmi(regs);
842
843 nmi_exit();
844}
845
846void stop_nmi(void)
847{
848 acpi_nmi_disable();
849 ignore_nmis++;
850}
851
852void restart_nmi(void)
853{
854 ignore_nmis--;
855 acpi_nmi_enable();
856}
857
868/* runs on IST stack. */ 858/* runs on IST stack. */
869asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) 859asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
870{ 860{
871 trace_hardirqs_fixup(); 861 trace_hardirqs_fixup();
872 862
873 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { 863 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
864 == NOTIFY_STOP)
874 return; 865 return;
875 } 866
876 preempt_conditional_sti(regs); 867 preempt_conditional_sti(regs);
877 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 868 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
878 preempt_conditional_cli(regs); 869 preempt_conditional_cli(regs);
@@ -903,8 +894,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
903asmlinkage void __kprobes do_debug(struct pt_regs * regs, 894asmlinkage void __kprobes do_debug(struct pt_regs * regs,
904 unsigned long error_code) 895 unsigned long error_code)
905{ 896{
906 unsigned long condition;
907 struct task_struct *tsk = current; 897 struct task_struct *tsk = current;
898 unsigned long condition;
908 siginfo_t info; 899 siginfo_t info;
909 900
910 trace_hardirqs_fixup(); 901 trace_hardirqs_fixup();
@@ -925,21 +916,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
925 916
926 /* Mask out spurious debug traps due to lazy DR7 setting */ 917 /* Mask out spurious debug traps due to lazy DR7 setting */
927 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 918 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
928 if (!tsk->thread.debugreg7) { 919 if (!tsk->thread.debugreg7)
929 goto clear_dr7; 920 goto clear_dr7;
930 }
931 } 921 }
932 922
933 tsk->thread.debugreg6 = condition; 923 tsk->thread.debugreg6 = condition;
934 924
935
936 /* 925 /*
937 * Single-stepping through TF: make sure we ignore any events in 926 * Single-stepping through TF: make sure we ignore any events in
938 * kernel space (but re-enable TF when returning to user mode). 927 * kernel space (but re-enable TF when returning to user mode).
939 */ 928 */
940 if (condition & DR_STEP) { 929 if (condition & DR_STEP) {
941 if (!user_mode(regs)) 930 if (!user_mode(regs))
942 goto clear_TF_reenable; 931 goto clear_TF_reenable;
943 } 932 }
944 933
945 /* Ok, finally something we can handle */ 934 /* Ok, finally something we can handle */
@@ -952,7 +941,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
952 force_sig_info(SIGTRAP, &info, tsk); 941 force_sig_info(SIGTRAP, &info, tsk);
953 942
954clear_dr7: 943clear_dr7:
955 set_debugreg(0UL, 7); 944 set_debugreg(0, 7);
956 preempt_conditional_cli(regs); 945 preempt_conditional_cli(regs);
957 return; 946 return;
958 947
@@ -960,6 +949,7 @@ clear_TF_reenable:
960 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 949 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
961 regs->flags &= ~X86_EFLAGS_TF; 950 regs->flags &= ~X86_EFLAGS_TF;
962 preempt_conditional_cli(regs); 951 preempt_conditional_cli(regs);
952 return;
963} 953}
964 954
965static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 955static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -982,7 +972,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
982asmlinkage void do_coprocessor_error(struct pt_regs *regs) 972asmlinkage void do_coprocessor_error(struct pt_regs *regs)
983{ 973{
984 void __user *ip = (void __user *)(regs->ip); 974 void __user *ip = (void __user *)(regs->ip);
985 struct task_struct * task; 975 struct task_struct *task;
986 siginfo_t info; 976 siginfo_t info;
987 unsigned short cwd, swd; 977 unsigned short cwd, swd;
988 978
@@ -1015,30 +1005,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1015 cwd = get_fpu_cwd(task); 1005 cwd = get_fpu_cwd(task);
1016 swd = get_fpu_swd(task); 1006 swd = get_fpu_swd(task);
1017 switch (swd & ~cwd & 0x3f) { 1007 switch (swd & ~cwd & 0x3f) {
1018 case 0x000: 1008 case 0x000: /* No unmasked exception */
1019 default: 1009 default: /* Multiple exceptions */
1020 break; 1010 break;
1021 case 0x001: /* Invalid Op */ 1011 case 0x001: /* Invalid Op */
1022 /* 1012 /*
1023 * swd & 0x240 == 0x040: Stack Underflow 1013 * swd & 0x240 == 0x040: Stack Underflow
1024 * swd & 0x240 == 0x240: Stack Overflow 1014 * swd & 0x240 == 0x240: Stack Overflow
1025 * User must clear the SF bit (0x40) if set 1015 * User must clear the SF bit (0x40) if set
1026 */ 1016 */
1027 info.si_code = FPE_FLTINV; 1017 info.si_code = FPE_FLTINV;
1028 break; 1018 break;
1029 case 0x002: /* Denormalize */ 1019 case 0x002: /* Denormalize */
1030 case 0x010: /* Underflow */ 1020 case 0x010: /* Underflow */
1031 info.si_code = FPE_FLTUND; 1021 info.si_code = FPE_FLTUND;
1032 break; 1022 break;
1033 case 0x004: /* Zero Divide */ 1023 case 0x004: /* Zero Divide */
1034 info.si_code = FPE_FLTDIV; 1024 info.si_code = FPE_FLTDIV;
1035 break; 1025 break;
1036 case 0x008: /* Overflow */ 1026 case 0x008: /* Overflow */
1037 info.si_code = FPE_FLTOVF; 1027 info.si_code = FPE_FLTOVF;
1038 break; 1028 break;
1039 case 0x020: /* Precision */ 1029 case 0x020: /* Precision */
1040 info.si_code = FPE_FLTRES; 1030 info.si_code = FPE_FLTRES;
1041 break; 1031 break;
1042 } 1032 }
1043 force_sig_info(SIGFPE, &info, task); 1033 force_sig_info(SIGFPE, &info, task);
1044} 1034}
@@ -1051,7 +1041,7 @@ asmlinkage void bad_intr(void)
1051asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1052{ 1042{
1053 void __user *ip = (void __user *)(regs->ip); 1043 void __user *ip = (void __user *)(regs->ip);
1054 struct task_struct * task; 1044 struct task_struct *task;
1055 siginfo_t info; 1045 siginfo_t info;
1056 unsigned short mxcsr; 1046 unsigned short mxcsr;
1057 1047
@@ -1079,25 +1069,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1079 */ 1069 */
1080 mxcsr = get_fpu_mxcsr(task); 1070 mxcsr = get_fpu_mxcsr(task);
1081 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { 1071 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1082 case 0x000: 1072 case 0x000:
1083 default: 1073 default:
1084 break; 1074 break;
1085 case 0x001: /* Invalid Op */ 1075 case 0x001: /* Invalid Op */
1086 info.si_code = FPE_FLTINV; 1076 info.si_code = FPE_FLTINV;
1087 break; 1077 break;
1088 case 0x002: /* Denormalize */ 1078 case 0x002: /* Denormalize */
1089 case 0x010: /* Underflow */ 1079 case 0x010: /* Underflow */
1090 info.si_code = FPE_FLTUND; 1080 info.si_code = FPE_FLTUND;
1091 break; 1081 break;
1092 case 0x004: /* Zero Divide */ 1082 case 0x004: /* Zero Divide */
1093 info.si_code = FPE_FLTDIV; 1083 info.si_code = FPE_FLTDIV;
1094 break; 1084 break;
1095 case 0x008: /* Overflow */ 1085 case 0x008: /* Overflow */
1096 info.si_code = FPE_FLTOVF; 1086 info.si_code = FPE_FLTOVF;
1097 break; 1087 break;
1098 case 0x020: /* Precision */ 1088 case 0x020: /* Precision */
1099 info.si_code = FPE_FLTRES; 1089 info.si_code = FPE_FLTRES;
1100 break; 1090 break;
1101 } 1091 }
1102 force_sig_info(SIGFPE, &info, task); 1092 force_sig_info(SIGFPE, &info, task);
1103} 1093}
@@ -1115,7 +1105,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1115} 1105}
1116 1106
1117/* 1107/*
1118 * 'math_state_restore()' saves the current math information in the 1108 * 'math_state_restore()' saves the current math information in the
1119 * old math state array, and gets the new ones from the current task 1109 * old math state array, and gets the new ones from the current task
1120 * 1110 *
1121 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 1111 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1140,7 +1130,7 @@ asmlinkage void math_state_restore(void)
1140 local_irq_disable(); 1130 local_irq_disable();
1141 } 1131 }
1142 1132
1143 clts(); /* Allow maths ops (or we recurse) */ 1133 clts(); /* Allow maths ops (or we recurse) */
1144 restore_fpu_checking(&me->thread.xstate->fxsave); 1134 restore_fpu_checking(&me->thread.xstate->fxsave);
1145 task_thread_info(me)->status |= TS_USEDFPU; 1135 task_thread_info(me)->status |= TS_USEDFPU;
1146 me->fpu_counter++; 1136 me->fpu_counter++;
@@ -1149,64 +1139,61 @@ EXPORT_SYMBOL_GPL(math_state_restore);
1149 1139
1150void __init trap_init(void) 1140void __init trap_init(void)
1151{ 1141{
1152 set_intr_gate(0,&divide_error); 1142 set_intr_gate(0, &divide_error);
1153 set_intr_gate_ist(1,&debug,DEBUG_STACK); 1143 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1154 set_intr_gate_ist(2,&nmi,NMI_STACK); 1144 set_intr_gate_ist(2, &nmi, NMI_STACK);
1155 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ 1145 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
1156 set_system_gate(4,&overflow); /* int4 can be called from all */ 1146 set_system_gate(4, &overflow); /* int4 can be called from all */
1157 set_intr_gate(5,&bounds); 1147 set_intr_gate(5, &bounds);
1158 set_intr_gate(6,&invalid_op); 1148 set_intr_gate(6, &invalid_op);
1159 set_intr_gate(7,&device_not_available); 1149 set_intr_gate(7, &device_not_available);
1160 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); 1150 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1161 set_intr_gate(9,&coprocessor_segment_overrun); 1151 set_intr_gate(9, &coprocessor_segment_overrun);
1162 set_intr_gate(10,&invalid_TSS); 1152 set_intr_gate(10, &invalid_TSS);
1163 set_intr_gate(11,&segment_not_present); 1153 set_intr_gate(11, &segment_not_present);
1164 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); 1154 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1165 set_intr_gate(13,&general_protection); 1155 set_intr_gate(13, &general_protection);
1166 set_intr_gate(14,&page_fault); 1156 set_intr_gate(14, &page_fault);
1167 set_intr_gate(15,&spurious_interrupt_bug); 1157 set_intr_gate(15, &spurious_interrupt_bug);
1168 set_intr_gate(16,&coprocessor_error); 1158 set_intr_gate(16, &coprocessor_error);
1169 set_intr_gate(17,&alignment_check); 1159 set_intr_gate(17, &alignment_check);
1170#ifdef CONFIG_X86_MCE 1160#ifdef CONFIG_X86_MCE
1171 set_intr_gate_ist(18,&machine_check, MCE_STACK); 1161 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1172#endif 1162#endif
1173 set_intr_gate(19,&simd_coprocessor_error); 1163 set_intr_gate(19, &simd_coprocessor_error);
1174 1164
1175#ifdef CONFIG_IA32_EMULATION 1165#ifdef CONFIG_IA32_EMULATION
1176 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 1166 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1177#endif 1167#endif
1178
1179 /* 1168 /*
1180 * initialize the per thread extended state: 1169 * initialize the per thread extended state:
1181 */ 1170 */
1182 init_thread_xstate(); 1171 init_thread_xstate();
1183 /* 1172 /*
1184 * Should be a barrier for any external CPU state. 1173 * Should be a barrier for any external CPU state:
1185 */ 1174 */
1186 cpu_init(); 1175 cpu_init();
1187} 1176}
1188 1177
1189
1190static int __init oops_setup(char *s) 1178static int __init oops_setup(char *s)
1191{ 1179{
1192 if (!s) 1180 if (!s)
1193 return -EINVAL; 1181 return -EINVAL;
1194 if (!strcmp(s, "panic")) 1182 if (!strcmp(s, "panic"))
1195 panic_on_oops = 1; 1183 panic_on_oops = 1;
1196 return 0; 1184 return 0;
1197} 1185}
1198early_param("oops", oops_setup); 1186early_param("oops", oops_setup);
1199 1187
1200static int __init kstack_setup(char *s) 1188static int __init kstack_setup(char *s)
1201{ 1189{
1202 if (!s) 1190 if (!s)
1203 return -EINVAL; 1191 return -EINVAL;
1204 kstack_depth_to_print = simple_strtoul(s,NULL,0); 1192 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1205 return 0; 1193 return 0;
1206} 1194}
1207early_param("kstack", kstack_setup); 1195early_param("kstack", kstack_setup);
1208 1196
1209
1210static int __init code_bytes_setup(char *s) 1197static int __init code_bytes_setup(char *s)
1211{ 1198{
1212 code_bytes = simple_strtoul(s, NULL, 0); 1199 code_bytes = simple_strtoul(s, NULL, 0);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 000000000000..7603c0553909
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,535 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h>
10#include <linux/clocksource.h>
11#include <linux/percpu.h>
12
13#include <asm/hpet.h>
14#include <asm/timer.h>
15#include <asm/vgtod.h>
16#include <asm/time.h>
17#include <asm/delay.h>
18
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz);
21unsigned int tsc_khz;
22EXPORT_SYMBOL(tsc_khz);
23
24/*
25 * TSC can be unstable due to cpufreq or due to unsynced TSCs
26 */
27static int tsc_unstable;
28
29/* native_sched_clock() is called before tsc_init(), so
30 we must start with the TSC soft disabled to prevent
31 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1;
33
34/*
35 * Scheduler clock - returns current time in nanosec units.
36 */
37u64 native_sched_clock(void)
38{
39 u64 this_offset;
40
41 /*
42 * Fall back to jiffies if there's no TSC available:
43 * ( But note that we still use it if the TSC is marked
44 * unstable. We do this because unlike Time Of Day,
45 * the scheduler clock tolerates small errors and it's
46 * very important for it to be as fast as the platform
47 * can achive it. )
48 */
49 if (unlikely(tsc_disabled)) {
50 /* No locking but a rare wrong value is not a big deal: */
51 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
52 }
53
54 /* read the Time Stamp Counter: */
55 rdtscll(this_offset);
56
57 /* return the value in ns */
58 return cycles_2_ns(this_offset);
59}
60
61/* We need to define a real function for sched_clock, to override the
62 weak default version */
63#ifdef CONFIG_PARAVIRT
64unsigned long long sched_clock(void)
65{
66 return paravirt_sched_clock();
67}
68#else
69unsigned long long
70sched_clock(void) __attribute__((alias("native_sched_clock")));
71#endif
72
73int check_tsc_unstable(void)
74{
75 return tsc_unstable;
76}
77EXPORT_SYMBOL_GPL(check_tsc_unstable);
78
79#ifdef CONFIG_X86_TSC
80int __init notsc_setup(char *str)
81{
82 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
83 "cannot disable TSC completely.\n");
84 tsc_disabled = 1;
85 return 1;
86}
87#else
88/*
89 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
90 * in cpu/common.c
91 */
92int __init notsc_setup(char *str)
93{
94 setup_clear_cpu_cap(X86_FEATURE_TSC);
95 return 1;
96}
97#endif
98
99__setup("notsc", notsc_setup);
100
101#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000
103
104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
108{
109 u64 t1, t2;
110 int i;
111
112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles();
114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else
117 *pm = acpi_pm_read_early();
118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2;
121 }
122 return ULLONG_MAX;
123}
124
125/**
126 * native_calibrate_tsc - calibrate the tsc on boot
127 */
128unsigned long native_calibrate_tsc(void)
129{
130 unsigned long flags;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134
135 local_irq_save(flags);
136
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
138
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140
141 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
144 tr1 = get_cycles();
145 while ((inb(0x61) & 0x20) == 0);
146 tr2 = get_cycles();
147
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
149
150 local_irq_restore(flags);
151
152 /*
153 * Preset the result with the raw and inaccurate PIT
154 * calibration value
155 */
156 delta = (tr2 - tr1);
157 do_div(delta, 50);
158 tsc_khz_val = delta;
159
160 /* hpet or pmtimer available ? */
161 if (!hpet && !pm1 && !pm2) {
162 printk(KERN_INFO "TSC calibrated against PIT\n");
163 goto out;
164 }
165
166 /* Check, whether the sampling was disturbed by an SMI */
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
169 "using PIT calibration result\n");
170 goto out;
171 }
172
173 tsc2 = (tsc2 - tsc1) * 1000000LL;
174
175 if (hpet) {
176 printk(KERN_INFO "TSC calibrated against HPET\n");
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 }
190
191 do_div(tsc2, tsc1);
192 tsc_khz_val = tsc2;
193
194out:
195 return tsc_khz_val;
196}
197
198
199#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void)
202{
203#ifndef CONFIG_SMP
204 unsigned long cpu_khz_old = cpu_khz;
205
206 if (cpu_has_tsc) {
207 tsc_khz = calibrate_tsc();
208 cpu_khz = tsc_khz;
209 cpu_data(0).loops_per_jiffy =
210 cpufreq_scale(cpu_data(0).loops_per_jiffy,
211 cpu_khz_old, cpu_khz);
212 return 0;
213 } else
214 return -ENODEV;
215#else
216 return -ENODEV;
217#endif
218}
219
220EXPORT_SYMBOL(recalibrate_cpu_khz);
221
222#endif /* CONFIG_X86_32 */
223
224/* Accelerators for sched_clock()
225 * convert from cycles(64bits) => nanoseconds (64bits)
226 * basic equation:
227 * ns = cycles / (freq / ns_per_sec)
228 * ns = cycles * (ns_per_sec / freq)
229 * ns = cycles * (10^9 / (cpu_khz * 10^3))
230 * ns = cycles * (10^6 / cpu_khz)
231 *
232 * Then we use scaling math (suggested by george@mvista.com) to get:
233 * ns = cycles * (10^6 * SC / cpu_khz) / SC
234 * ns = cycles * cyc2ns_scale / SC
235 *
236 * And since SC is a constant power of two, we can convert the div
237 * into a shift.
238 *
239 * We can use khz divisor instead of mhz to keep a better precision, since
240 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
241 * (mathieu.desnoyers@polymtl.ca)
242 *
243 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
244 */
245
246DEFINE_PER_CPU(unsigned long, cyc2ns);
247
248static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
249{
250 unsigned long long tsc_now, ns_now;
251 unsigned long flags, *scale;
252
253 local_irq_save(flags);
254 sched_clock_idle_sleep_event();
255
256 scale = &per_cpu(cyc2ns, cpu);
257
258 rdtscll(tsc_now);
259 ns_now = __cycles_2_ns(tsc_now);
260
261 if (cpu_khz)
262 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
263
264 sched_clock_idle_wakeup_event(0);
265 local_irq_restore(flags);
266}
267
268#ifdef CONFIG_CPU_FREQ
269
270/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
271 * changes.
272 *
273 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
274 * not that important because current Opteron setups do not support
275 * scaling on SMP anyroads.
276 *
277 * Should fix up last_tsc too. Currently gettimeofday in the
278 * first tick after the change will be slightly wrong.
279 */
280
281static unsigned int ref_freq;
282static unsigned long loops_per_jiffy_ref;
283static unsigned long tsc_khz_ref;
284
285static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
286 void *data)
287{
288 struct cpufreq_freqs *freq = data;
289 unsigned long *lpj, dummy;
290
291 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
292 return 0;
293
294 lpj = &dummy;
295 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
296#ifdef CONFIG_SMP
297 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
298#else
299 lpj = &boot_cpu_data.loops_per_jiffy;
300#endif
301
302 if (!ref_freq) {
303 ref_freq = freq->old;
304 loops_per_jiffy_ref = *lpj;
305 tsc_khz_ref = tsc_khz;
306 }
307 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
308 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
309 (val == CPUFREQ_RESUMECHANGE)) {
310 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
311
312 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
313 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
314 mark_tsc_unstable("cpufreq changes");
315 }
316
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
318
319 return 0;
320}
321
322static struct notifier_block time_cpufreq_notifier_block = {
323 .notifier_call = time_cpufreq_notifier
324};
325
326static int __init cpufreq_tsc(void)
327{
328 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0;
331}
332
333core_initcall(cpufreq_tsc);
334
335#endif /* CONFIG_CPU_FREQ */
336
337/* clocksource code */
338
339static struct clocksource clocksource_tsc;
340
341/*
342 * We compare the TSC to the cycle_last value in the clocksource
343 * structure to avoid a nasty time-warp. This can be observed in a
344 * very small window right after one CPU updated cycle_last under
345 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
346 * is smaller than the cycle_last reference value due to a TSC which
347 * is slighty behind. This delta is nowhere else observable, but in
348 * that case it results in a forward time jump in the range of hours
349 * due to the unsigned delta calculation of the time keeping core
350 * code, which is necessary to support wrapping clocksources like pm
351 * timer.
352 */
353static cycle_t read_tsc(void)
354{
355 cycle_t ret = (cycle_t)get_cycles();
356
357 return ret >= clocksource_tsc.cycle_last ?
358 ret : clocksource_tsc.cycle_last;
359}
360
361#ifdef CONFIG_X86_64
362static cycle_t __vsyscall_fn vread_tsc(void)
363{
364 cycle_t ret = (cycle_t)vget_cycles();
365
366 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
367 ret : __vsyscall_gtod_data.clock.cycle_last;
368}
369#endif
370
371static struct clocksource clocksource_tsc = {
372 .name = "tsc",
373 .rating = 300,
374 .read = read_tsc,
375 .mask = CLOCKSOURCE_MASK(64),
376 .shift = 22,
377 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
378 CLOCK_SOURCE_MUST_VERIFY,
379#ifdef CONFIG_X86_64
380 .vread = vread_tsc,
381#endif
382};
383
384void mark_tsc_unstable(char *reason)
385{
386 if (!tsc_unstable) {
387 tsc_unstable = 1;
388 printk("Marking TSC unstable due to %s\n", reason);
389 /* Change only the rating, when not registered */
390 if (clocksource_tsc.mult)
391 clocksource_change_rating(&clocksource_tsc, 0);
392 else
393 clocksource_tsc.rating = 0;
394 }
395}
396
397EXPORT_SYMBOL_GPL(mark_tsc_unstable);
398
399static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
400{
401 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
402 d->ident);
403 tsc_unstable = 1;
404 return 0;
405}
406
407/* List of systems that have known TSC problems */
408static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
409 {
410 .callback = dmi_mark_tsc_unstable,
411 .ident = "IBM Thinkpad 380XD",
412 .matches = {
413 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
414 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
415 },
416 },
417 {}
418};
419
420/*
421 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
422 */
423#ifdef CONFIG_MGEODE_LX
424/* RTSC counts during suspend */
425#define RTSC_SUSP 0x100
426
427static void __init check_geode_tsc_reliable(void)
428{
429 unsigned long res_low, res_high;
430
431 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
432 if (res_low & RTSC_SUSP)
433 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
434}
435#else
436static inline void check_geode_tsc_reliable(void) { }
437#endif
438
439/*
440 * Make an educated guess if the TSC is trustworthy and synchronized
441 * over all CPUs.
442 */
443__cpuinit int unsynchronized_tsc(void)
444{
445 if (!cpu_has_tsc || tsc_unstable)
446 return 1;
447
448#ifdef CONFIG_SMP
449 if (apic_is_clustered_box())
450 return 1;
451#endif
452
453 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
454 return 0;
455 /*
456 * Intel systems are normally all synchronized.
457 * Exceptions must mark TSC as unstable:
458 */
459 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
460 /* assume multi socket systems are not synchronized: */
461 if (num_possible_cpus() > 1)
462 tsc_unstable = 1;
463 }
464
465 return tsc_unstable;
466}
467
468static void __init init_tsc_clocksource(void)
469{
470 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
471 clocksource_tsc.shift);
472 /* lower the rating if we already know its unstable: */
473 if (check_tsc_unstable()) {
474 clocksource_tsc.rating = 0;
475 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
476 }
477 clocksource_register(&clocksource_tsc);
478}
479
480void __init tsc_init(void)
481{
482 u64 lpj;
483 int cpu;
484
485 if (!cpu_has_tsc)
486 return;
487
488 tsc_khz = calibrate_tsc();
489 cpu_khz = tsc_khz;
490
491 if (!tsc_khz) {
492 mark_tsc_unstable("could not calculate TSC khz");
493 return;
494 }
495
496#ifdef CONFIG_X86_64
497 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
498 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
499 cpu_khz = calibrate_cpu();
500#endif
501
502 lpj = ((u64)tsc_khz * 1000);
503 do_div(lpj, HZ);
504 lpj_fine = lpj;
505
506 printk("Detected %lu.%03lu MHz processor.\n",
507 (unsigned long)cpu_khz / 1000,
508 (unsigned long)cpu_khz % 1000);
509
510 /*
511 * Secondary CPUs do not run through tsc_init(), so set up
512 * all the scale factors for all CPUs, assuming the same
513 * speed as the bootup CPU. (cpufreq notifiers will fix this
514 * up if their speed diverges)
515 */
516 for_each_possible_cpu(cpu)
517 set_cyc2ns_scale(cpu_khz, cpu);
518
519 if (tsc_disabled > 0)
520 return;
521
522 /* now allow native_sched_clock() to use rdtsc */
523 tsc_disabled = 0;
524
525 use_tsc_delay();
526 /* Check and install the TSC clocksource */
527 dmi_check_system(bad_tsc_dmi_table);
528
529 if (unsynchronized_tsc())
530 mark_tsc_unstable("TSCs unsynchronized");
531
532 check_geode_tsc_reliable();
533 init_tsc_clocksource();
534}
535
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
deleted file mode 100644
index 65b70637ad97..000000000000
--- a/arch/x86/kernel/tsc_32.c
+++ /dev/null
@@ -1,451 +0,0 @@
1#include <linux/sched.h>
2#include <linux/clocksource.h>
3#include <linux/workqueue.h>
4#include <linux/cpufreq.h>
5#include <linux/jiffies.h>
6#include <linux/init.h>
7#include <linux/dmi.h>
8#include <linux/percpu.h>
9
10#include <asm/delay.h>
11#include <asm/tsc.h>
12#include <asm/io.h>
13#include <asm/timer.h>
14
15#include "mach_timer.h"
16
17/* native_sched_clock() is called before tsc_init(), so
18 we must start with the TSC soft disabled to prevent
19 erroneous rdtsc usage on !cpu_has_tsc processors */
20static int tsc_disabled = -1;
21
22/*
23 * On some systems the TSC frequency does not
24 * change with the cpu frequency. So we need
25 * an extra value to store the TSC freq
26 */
27unsigned int tsc_khz;
28EXPORT_SYMBOL_GPL(tsc_khz);
29
30#ifdef CONFIG_X86_TSC
31static int __init tsc_setup(char *str)
32{
33 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
34 "cannot disable TSC completely.\n");
35 tsc_disabled = 1;
36 return 1;
37}
38#else
39/*
40 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
41 * in cpu/common.c
42 */
43static int __init tsc_setup(char *str)
44{
45 setup_clear_cpu_cap(X86_FEATURE_TSC);
46 return 1;
47}
48#endif
49
50__setup("notsc", tsc_setup);
51
52/*
53 * code to mark and check if the TSC is unstable
54 * due to cpufreq or due to unsynced TSCs
55 */
56static int tsc_unstable;
57
58int check_tsc_unstable(void)
59{
60 return tsc_unstable;
61}
62EXPORT_SYMBOL_GPL(check_tsc_unstable);
63
64/* Accelerators for sched_clock()
65 * convert from cycles(64bits) => nanoseconds (64bits)
66 * basic equation:
67 * ns = cycles / (freq / ns_per_sec)
68 * ns = cycles * (ns_per_sec / freq)
69 * ns = cycles * (10^9 / (cpu_khz * 10^3))
70 * ns = cycles * (10^6 / cpu_khz)
71 *
72 * Then we use scaling math (suggested by george@mvista.com) to get:
73 * ns = cycles * (10^6 * SC / cpu_khz) / SC
74 * ns = cycles * cyc2ns_scale / SC
75 *
76 * And since SC is a constant power of two, we can convert the div
77 * into a shift.
78 *
79 * We can use khz divisor instead of mhz to keep a better precision, since
80 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
81 * (mathieu.desnoyers@polymtl.ca)
82 *
83 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
84 */
85
86DEFINE_PER_CPU(unsigned long, cyc2ns);
87
88static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
89{
90 unsigned long long tsc_now, ns_now;
91 unsigned long flags, *scale;
92
93 local_irq_save(flags);
94 sched_clock_idle_sleep_event();
95
96 scale = &per_cpu(cyc2ns, cpu);
97
98 rdtscll(tsc_now);
99 ns_now = __cycles_2_ns(tsc_now);
100
101 if (cpu_khz)
102 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
103
104 /*
105 * Start smoothly with the new frequency:
106 */
107 sched_clock_idle_wakeup_event(0);
108 local_irq_restore(flags);
109}
110
111/*
112 * Scheduler clock - returns current time in nanosec units.
113 */
114unsigned long long native_sched_clock(void)
115{
116 unsigned long long this_offset;
117
118 /*
119 * Fall back to jiffies if there's no TSC available:
120 * ( But note that we still use it if the TSC is marked
121 * unstable. We do this because unlike Time Of Day,
122 * the scheduler clock tolerates small errors and it's
123 * very important for it to be as fast as the platform
124 * can achive it. )
125 */
126 if (unlikely(tsc_disabled))
127 /* No locking but a rare wrong value is not a big deal: */
128 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
129
130 /* read the Time Stamp Counter: */
131 rdtscll(this_offset);
132
133 /* return the value in ns */
134 return cycles_2_ns(this_offset);
135}
136
137/* We need to define a real function for sched_clock, to override the
138 weak default version */
139#ifdef CONFIG_PARAVIRT
140unsigned long long sched_clock(void)
141{
142 return paravirt_sched_clock();
143}
144#else
145unsigned long long sched_clock(void)
146 __attribute__((alias("native_sched_clock")));
147#endif
148
149unsigned long native_calculate_cpu_khz(void)
150{
151 unsigned long long start, end;
152 unsigned long count;
153 u64 delta64 = (u64)ULLONG_MAX;
154 int i;
155 unsigned long flags;
156
157 local_irq_save(flags);
158
159 /* run 3 times to ensure the cache is warm and to get an accurate reading */
160 for (i = 0; i < 3; i++) {
161 mach_prepare_counter();
162 rdtscll(start);
163 mach_countup(&count);
164 rdtscll(end);
165
166 /*
167 * Error: ECTCNEVERSET
168 * The CTC wasn't reliable: we got a hit on the very first read,
169 * or the CPU was so fast/slow that the quotient wouldn't fit in
170 * 32 bits..
171 */
172 if (count <= 1)
173 continue;
174
175 /* cpu freq too slow: */
176 if ((end - start) <= CALIBRATE_TIME_MSEC)
177 continue;
178
179 /*
180 * We want the minimum time of all runs in case one of them
181 * is inaccurate due to SMI or other delay
182 */
183 delta64 = min(delta64, (end - start));
184 }
185
186 /* cpu freq too fast (or every run was bad): */
187 if (delta64 > (1ULL<<32))
188 goto err;
189
190 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
191 do_div(delta64,CALIBRATE_TIME_MSEC);
192
193 local_irq_restore(flags);
194 return (unsigned long)delta64;
195err:
196 local_irq_restore(flags);
197 return 0;
198}
199
200int recalibrate_cpu_khz(void)
201{
202#ifndef CONFIG_SMP
203 unsigned long cpu_khz_old = cpu_khz;
204
205 if (cpu_has_tsc) {
206 cpu_khz = calculate_cpu_khz();
207 tsc_khz = cpu_khz;
208 cpu_data(0).loops_per_jiffy =
209 cpufreq_scale(cpu_data(0).loops_per_jiffy,
210 cpu_khz_old, cpu_khz);
211 return 0;
212 } else
213 return -ENODEV;
214#else
215 return -ENODEV;
216#endif
217}
218
219EXPORT_SYMBOL(recalibrate_cpu_khz);
220
221#ifdef CONFIG_CPU_FREQ
222
223/*
224 * if the CPU frequency is scaled, TSC-based delays will need a different
225 * loops_per_jiffy value to function properly.
226 */
227static unsigned int ref_freq;
228static unsigned long loops_per_jiffy_ref;
229static unsigned long cpu_khz_ref;
230
231static int
232time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
233{
234 struct cpufreq_freqs *freq = data;
235
236 if (!ref_freq) {
237 if (!freq->old){
238 ref_freq = freq->new;
239 return 0;
240 }
241 ref_freq = freq->old;
242 loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy;
243 cpu_khz_ref = cpu_khz;
244 }
245
246 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
247 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
248 (val == CPUFREQ_RESUMECHANGE)) {
249 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
250 cpu_data(freq->cpu).loops_per_jiffy =
251 cpufreq_scale(loops_per_jiffy_ref,
252 ref_freq, freq->new);
253
254 if (cpu_khz) {
255
256 if (num_online_cpus() == 1)
257 cpu_khz = cpufreq_scale(cpu_khz_ref,
258 ref_freq, freq->new);
259 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
260 tsc_khz = cpu_khz;
261 set_cyc2ns_scale(cpu_khz, freq->cpu);
262 /*
263 * TSC based sched_clock turns
264 * to junk w/ cpufreq
265 */
266 mark_tsc_unstable("cpufreq changes");
267 }
268 }
269 }
270
271 return 0;
272}
273
274static struct notifier_block time_cpufreq_notifier_block = {
275 .notifier_call = time_cpufreq_notifier
276};
277
278static int __init cpufreq_tsc(void)
279{
280 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
281 CPUFREQ_TRANSITION_NOTIFIER);
282}
283core_initcall(cpufreq_tsc);
284
285#endif
286
287/* clock source code */
288
289static unsigned long current_tsc_khz;
290static struct clocksource clocksource_tsc;
291
292/*
293 * We compare the TSC to the cycle_last value in the clocksource
294 * structure to avoid a nasty time-warp issue. This can be observed in
295 * a very small window right after one CPU updated cycle_last under
296 * xtime lock and the other CPU reads a TSC value which is smaller
297 * than the cycle_last reference value due to a TSC which is slighty
298 * behind. This delta is nowhere else observable, but in that case it
299 * results in a forward time jump in the range of hours due to the
300 * unsigned delta calculation of the time keeping core code, which is
301 * necessary to support wrapping clocksources like pm timer.
302 */
303static cycle_t read_tsc(void)
304{
305 cycle_t ret;
306
307 rdtscll(ret);
308
309 return ret >= clocksource_tsc.cycle_last ?
310 ret : clocksource_tsc.cycle_last;
311}
312
313static struct clocksource clocksource_tsc = {
314 .name = "tsc",
315 .rating = 300,
316 .read = read_tsc,
317 .mask = CLOCKSOURCE_MASK(64),
318 .mult = 0, /* to be set */
319 .shift = 22,
320 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
321 CLOCK_SOURCE_MUST_VERIFY,
322};
323
324void mark_tsc_unstable(char *reason)
325{
326 if (!tsc_unstable) {
327 tsc_unstable = 1;
328 printk("Marking TSC unstable due to: %s.\n", reason);
329 /* Can be called before registration */
330 if (clocksource_tsc.mult)
331 clocksource_change_rating(&clocksource_tsc, 0);
332 else
333 clocksource_tsc.rating = 0;
334 }
335}
336EXPORT_SYMBOL_GPL(mark_tsc_unstable);
337
338static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
339{
340 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
341 d->ident);
342 tsc_unstable = 1;
343 return 0;
344}
345
346/* List of systems that have known TSC problems */
347static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
348 {
349 .callback = dmi_mark_tsc_unstable,
350 .ident = "IBM Thinkpad 380XD",
351 .matches = {
352 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
353 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
354 },
355 },
356 {}
357};
358
359/*
360 * Make an educated guess if the TSC is trustworthy and synchronized
361 * over all CPUs.
362 */
363__cpuinit int unsynchronized_tsc(void)
364{
365 if (!cpu_has_tsc || tsc_unstable)
366 return 1;
367
368 /* Anything with constant TSC should be synchronized */
369 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
370 return 0;
371
372 /*
373 * Intel systems are normally all synchronized.
374 * Exceptions must mark TSC as unstable:
375 */
376 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
377 /* assume multi socket systems are not synchronized: */
378 if (num_possible_cpus() > 1)
379 tsc_unstable = 1;
380 }
381 return tsc_unstable;
382}
383
384/*
385 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
386 */
387#ifdef CONFIG_MGEODE_LX
388/* RTSC counts during suspend */
389#define RTSC_SUSP 0x100
390
391static void __init check_geode_tsc_reliable(void)
392{
393 unsigned long res_low, res_high;
394
395 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
396 if (res_low & RTSC_SUSP)
397 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
398}
399#else
400static inline void check_geode_tsc_reliable(void) { }
401#endif
402
403
404void __init tsc_init(void)
405{
406 int cpu;
407
408 if (!cpu_has_tsc || tsc_disabled > 0)
409 return;
410
411 cpu_khz = calculate_cpu_khz();
412 tsc_khz = cpu_khz;
413
414 if (!cpu_khz) {
415 mark_tsc_unstable("could not calculate TSC khz");
416 return;
417 }
418
419 /* now allow native_sched_clock() to use rdtsc */
420 tsc_disabled = 0;
421
422 printk("Detected %lu.%03lu MHz processor.\n",
423 (unsigned long)cpu_khz / 1000,
424 (unsigned long)cpu_khz % 1000);
425
426 /*
427 * Secondary CPUs do not run through tsc_init(), so set up
428 * all the scale factors for all CPUs, assuming the same
429 * speed as the bootup CPU. (cpufreq notifiers will fix this
430 * up if their speed diverges)
431 */
432 for_each_possible_cpu(cpu)
433 set_cyc2ns_scale(cpu_khz, cpu);
434
435 use_tsc_delay();
436
437 /* Check and install the TSC clocksource */
438 dmi_check_system(bad_tsc_dmi_table);
439
440 unsynchronized_tsc();
441 check_geode_tsc_reliable();
442 current_tsc_khz = tsc_khz;
443 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
444 clocksource_tsc.shift);
445 /* lower the rating if we already know its unstable: */
446 if (check_tsc_unstable()) {
447 clocksource_tsc.rating = 0;
448 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
449 }
450 clocksource_register(&clocksource_tsc);
451}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
deleted file mode 100644
index 1784b8077a12..000000000000
--- a/arch/x86/kernel/tsc_64.c
+++ /dev/null
@@ -1,357 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9#include <linux/acpi_pmtmr.h>
10
11#include <asm/hpet.h>
12#include <asm/timex.h>
13#include <asm/timer.h>
14#include <asm/vgtod.h>
15
16static int notsc __initdata = 0;
17
18unsigned int cpu_khz; /* TSC clocks / usec, not used here */
19EXPORT_SYMBOL(cpu_khz);
20unsigned int tsc_khz;
21EXPORT_SYMBOL(tsc_khz);
22
23/* Accelerators for sched_clock()
24 * convert from cycles(64bits) => nanoseconds (64bits)
25 * basic equation:
26 * ns = cycles / (freq / ns_per_sec)
27 * ns = cycles * (ns_per_sec / freq)
28 * ns = cycles * (10^9 / (cpu_khz * 10^3))
29 * ns = cycles * (10^6 / cpu_khz)
30 *
31 * Then we use scaling math (suggested by george@mvista.com) to get:
32 * ns = cycles * (10^6 * SC / cpu_khz) / SC
33 * ns = cycles * cyc2ns_scale / SC
34 *
35 * And since SC is a constant power of two, we can convert the div
36 * into a shift.
37 *
38 * We can use khz divisor instead of mhz to keep a better precision, since
39 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
40 * (mathieu.desnoyers@polymtl.ca)
41 *
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
44DEFINE_PER_CPU(unsigned long, cyc2ns);
45
46static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
47{
48 unsigned long long tsc_now, ns_now;
49 unsigned long flags, *scale;
50
51 local_irq_save(flags);
52 sched_clock_idle_sleep_event();
53
54 scale = &per_cpu(cyc2ns, cpu);
55
56 rdtscll(tsc_now);
57 ns_now = __cycles_2_ns(tsc_now);
58
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
64}
65
66unsigned long long native_sched_clock(void)
67{
68 unsigned long a = 0;
69
70 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
71 * which means it is not completely exact and may not be monotonous
72 * between CPUs. But the errors should be too small to matter for
73 * scheduling purposes.
74 */
75
76 rdtscll(a);
77 return cycles_2_ns(a);
78}
79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
93static int tsc_unstable;
94
95int check_tsc_unstable(void)
96{
97 return tsc_unstable;
98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
101#ifdef CONFIG_CPU_FREQ
102
103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
104 * changes.
105 *
106 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
107 * not that important because current Opteron setups do not support
108 * scaling on SMP anyroads.
109 *
110 * Should fix up last_tsc too. Currently gettimeofday in the
111 * first tick after the change will be slightly wrong.
112 */
113
114static unsigned int ref_freq;
115static unsigned long loops_per_jiffy_ref;
116static unsigned long tsc_khz_ref;
117
118static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
119 void *data)
120{
121 struct cpufreq_freqs *freq = data;
122 unsigned long *lpj, dummy;
123
124 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
125 return 0;
126
127 lpj = &dummy;
128 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
129#ifdef CONFIG_SMP
130 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
131#else
132 lpj = &boot_cpu_data.loops_per_jiffy;
133#endif
134
135 if (!ref_freq) {
136 ref_freq = freq->old;
137 loops_per_jiffy_ref = *lpj;
138 tsc_khz_ref = tsc_khz;
139 }
140 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
141 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
142 (val == CPUFREQ_RESUMECHANGE)) {
143 *lpj =
144 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
145
146 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
147 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
148 mark_tsc_unstable("cpufreq changes");
149 }
150
151 set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
152
153 return 0;
154}
155
156static struct notifier_block time_cpufreq_notifier_block = {
157 .notifier_call = time_cpufreq_notifier
158};
159
160static int __init cpufreq_tsc(void)
161{
162 cpufreq_register_notifier(&time_cpufreq_notifier_block,
163 CPUFREQ_TRANSITION_NOTIFIER);
164 return 0;
165}
166
167core_initcall(cpufreq_tsc);
168
169#endif
170
171#define MAX_RETRIES 5
172#define SMI_TRESHOLD 50000
173
174/*
175 * Read TSC and the reference counters. Take care of SMI disturbance
176 */
177static unsigned long __init tsc_read_refs(unsigned long *pm,
178 unsigned long *hpet)
179{
180 unsigned long t1, t2;
181 int i;
182
183 for (i = 0; i < MAX_RETRIES; i++) {
184 t1 = get_cycles();
185 if (hpet)
186 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
187 else
188 *pm = acpi_pm_read_early();
189 t2 = get_cycles();
190 if ((t2 - t1) < SMI_TRESHOLD)
191 return t2;
192 }
193 return ULONG_MAX;
194}
195
196/**
197 * tsc_calibrate - calibrate the tsc on boot
198 */
199void __init tsc_calibrate(void)
200{
201 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
202 int hpet = is_hpet_enabled(), cpu;
203
204 local_irq_save(flags);
205
206 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
207
208 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
209
210 outb(0xb0, 0x43);
211 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
212 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
213 tr1 = get_cycles();
214 while ((inb(0x61) & 0x20) == 0);
215 tr2 = get_cycles();
216
217 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
218
219 local_irq_restore(flags);
220
221 /*
222 * Preset the result with the raw and inaccurate PIT
223 * calibration value
224 */
225 tsc_khz = (tr2 - tr1) / 50;
226
227 /* hpet or pmtimer available ? */
228 if (!hpet && !pm1 && !pm2) {
229 printk(KERN_INFO "TSC calibrated against PIT\n");
230 goto out;
231 }
232
233 /* Check, whether the sampling was disturbed by an SMI */
234 if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
235 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
236 "using PIT calibration result\n");
237 goto out;
238 }
239
240 tsc2 = (tsc2 - tsc1) * 1000000L;
241
242 if (hpet) {
243 printk(KERN_INFO "TSC calibrated against HPET\n");
244 if (hpet2 < hpet1)
245 hpet2 += 0x100000000;
246 hpet2 -= hpet1;
247 tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
248 } else {
249 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
250 if (pm2 < pm1)
251 pm2 += ACPI_PM_OVRRUN;
252 pm2 -= pm1;
253 tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC;
254 }
255
256 tsc_khz = tsc2 / tsc1;
257
258out:
259 for_each_possible_cpu(cpu)
260 set_cyc2ns_scale(tsc_khz, cpu);
261}
262
263/*
264 * Make an educated guess if the TSC is trustworthy and synchronized
265 * over all CPUs.
266 */
267__cpuinit int unsynchronized_tsc(void)
268{
269 if (tsc_unstable)
270 return 1;
271
272#ifdef CONFIG_SMP
273 if (apic_is_clustered_box())
274 return 1;
275#endif
276
277 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
278 return 0;
279
280 /* Assume multi socket systems are not synchronized */
281 return num_present_cpus() > 1;
282}
283
284int __init notsc_setup(char *s)
285{
286 notsc = 1;
287 return 1;
288}
289
290__setup("notsc", notsc_setup);
291
292static struct clocksource clocksource_tsc;
293
294/*
295 * We compare the TSC to the cycle_last value in the clocksource
296 * structure to avoid a nasty time-warp. This can be observed in a
297 * very small window right after one CPU updated cycle_last under
298 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
299 * is smaller than the cycle_last reference value due to a TSC which
300 * is slighty behind. This delta is nowhere else observable, but in
301 * that case it results in a forward time jump in the range of hours
302 * due to the unsigned delta calculation of the time keeping core
303 * code, which is necessary to support wrapping clocksources like pm
304 * timer.
305 */
306static cycle_t read_tsc(void)
307{
308 cycle_t ret = (cycle_t)get_cycles();
309
310 return ret >= clocksource_tsc.cycle_last ?
311 ret : clocksource_tsc.cycle_last;
312}
313
314static cycle_t __vsyscall_fn vread_tsc(void)
315{
316 cycle_t ret = (cycle_t)vget_cycles();
317
318 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
319 ret : __vsyscall_gtod_data.clock.cycle_last;
320}
321
322static struct clocksource clocksource_tsc = {
323 .name = "tsc",
324 .rating = 300,
325 .read = read_tsc,
326 .mask = CLOCKSOURCE_MASK(64),
327 .shift = 22,
328 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
329 CLOCK_SOURCE_MUST_VERIFY,
330 .vread = vread_tsc,
331};
332
333void mark_tsc_unstable(char *reason)
334{
335 if (!tsc_unstable) {
336 tsc_unstable = 1;
337 printk("Marking TSC unstable due to %s\n", reason);
338 /* Change only the rating, when not registered */
339 if (clocksource_tsc.mult)
340 clocksource_change_rating(&clocksource_tsc, 0);
341 else
342 clocksource_tsc.rating = 0;
343 }
344}
345EXPORT_SYMBOL_GPL(mark_tsc_unstable);
346
347void __init init_tsc_clocksource(void)
348{
349 if (!notsc) {
350 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
351 clocksource_tsc.shift);
352 if (check_tsc_unstable())
353 clocksource_tsc.rating = 0;
354
355 clocksource_register(&clocksource_tsc);
356 }
357}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
new file mode 100644
index 000000000000..41e01b145c48
--- /dev/null
+++ b/arch/x86/kernel/visws_quirks.c
@@ -0,0 +1,707 @@
1/*
2 * SGI Visual Workstation support and quirks, unmaintained.
3 *
4 * Split out from setup.c by davej@suse.de
5 *
6 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
7 *
8 * SGI Visual Workstation interrupt controller
9 *
10 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
11 * which serves as the main interrupt controller in the system. Non-legacy
12 * hardware in the system uses this controller directly. Legacy devices
13 * are connected to the PIIX4 which in turn has its 8259(s) connected to
14 * a of the Cobalt APIC entry.
15 *
16 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
17 *
18 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
19 */
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/smp.h>
24
25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h>
28#include <asm/fixmap.h>
29#include <asm/reboot.h>
30#include <asm/setup.h>
31#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h>
34
35#include <mach_ipi.h>
36
37#include "mach_apic.h"
38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
49#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53
54#include <linux/sched.h>
55#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h>
58#include <linux/pci_ids.h>
59
60extern int no_broadcast;
61
62#include <asm/io.h>
63#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67
68char visws_board_type = -1;
69char visws_board_rev = -1;
70
71int is_visws_box(void)
72{
73 return visws_board_type >= 0;
74}
75
76static int __init visws_time_init(void)
77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79
80 /* Set the countdown value */
81 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
82
83 /* Start the timer */
84 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
85
86 /* Enable (unmask) the timer interrupt */
87 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
88
89 /*
90 * Zero return means the generic timer setup code will set up
91 * the standard vector:
92 */
93 return 0;
94}
95
96static int __init visws_pre_intr_init(void)
97{
98 init_VISWS_APIC_irqs();
99
100 /*
101 * We dont want ISA irqs to be set up by the generic code:
102 */
103 return 1;
104}
105
106/* Quirk for machine specific memory setup. */
107
108#define MB (1024 * 1024)
109
110unsigned long sgivwfb_mem_phys;
111unsigned long sgivwfb_mem_size;
112EXPORT_SYMBOL(sgivwfb_mem_phys);
113EXPORT_SYMBOL(sgivwfb_mem_size);
114
115long long mem_size __initdata = 0;
116
117static char * __init visws_memory_setup(void)
118{
119 long long gfx_mem_size = 8 * MB;
120
121 mem_size = boot_params.alt_mem_k;
122
123 if (!mem_size) {
124 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
125 mem_size = 128 * MB;
126 }
127
128 /*
129 * this hardcodes the graphics memory to 8 MB
130 * it really should be sized dynamically (or at least
131 * set as a boot param)
132 */
133 if (!sgivwfb_mem_size) {
134 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
135 sgivwfb_mem_size = 8 * MB;
136 }
137
138 /*
139 * Trim to nearest MB
140 */
141 sgivwfb_mem_size &= ~((1 << 20) - 1);
142 sgivwfb_mem_phys = mem_size - gfx_mem_size;
143
144 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
145 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
146 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
147
148 return "PROM";
149}
150
151static void visws_machine_emergency_restart(void)
152{
153 /*
154 * Visual Workstations restart after this
155 * register is poked on the PIIX4
156 */
157 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
158}
159
160static void visws_machine_power_off(void)
161{
162 unsigned short pm_status;
163/* extern unsigned int pci_bus0; */
164
165 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
166 outw(pm_status, PMSTS_PORT);
167
168 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
169
170 mdelay(10);
171
172#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
173 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
174
175/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177}
178
179static int __init visws_get_smp_config(unsigned int early)
180{
181 /*
182 * Prevent MP-table parsing by the generic code:
183 */
184 return 1;
185}
186
187extern unsigned int __cpuinitdata maxcpus;
188
189/*
190 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table).
192 * No problem for Linux.
193 */
194
195static void __init MP_processor_info(struct mpc_config_processor *m)
196{
197 int ver, logical_apicid;
198 physid_mask_t apic_cpus;
199
200 if (!(m->mpc_cpuflag & CPU_ENABLED))
201 return;
202
203 logical_apicid = m->mpc_apicid;
204 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
205 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
206 m->mpc_apicid,
207 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
208 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
209 m->mpc_apicver);
210
211 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
212 boot_cpu_physical_apicid = m->mpc_apicid;
213
214 ver = m->mpc_apicver;
215 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
216 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
217 m->mpc_apicid, MAX_APICS);
218 return;
219 }
220
221 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
222 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
223 /*
224 * Validate version
225 */
226 if (ver == 0x0) {
227 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
228 "fixing up to 0x10. (tell your hw vendor)\n",
229 m->mpc_apicid);
230 ver = 0x10;
231 }
232 apic_version[m->mpc_apicid] = ver;
233}
234
235static int __init visws_find_smp_config(unsigned int reserve)
236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
239
240 if (ncpus > CO_CPU_MAX) {
241 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
242 ncpus, mp);
243
244 ncpus = CO_CPU_MAX;
245 }
246
247 if (ncpus > maxcpus)
248 ncpus = maxcpus;
249
250#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1;
252#endif
253 while (ncpus--)
254 MP_processor_info(mp++);
255
256 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
257
258 return 1;
259}
260
261static int visws_trap_init(void);
262
263static struct x86_quirks visws_x86_quirks __initdata = {
264 .arch_time_init = visws_time_init,
265 .arch_pre_intr_init = visws_pre_intr_init,
266 .arch_memory_setup = visws_memory_setup,
267 .arch_intr_init = NULL,
268 .arch_trap_init = visws_trap_init,
269 .mach_get_smp_config = visws_get_smp_config,
270 .mach_find_smp_config = visws_find_smp_config,
271};
272
273void __init visws_early_detect(void)
274{
275 int raw;
276
277 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
278 >> PIIX_GPI_BD_SHIFT;
279
280 if (visws_board_type < 0)
281 return;
282
283 /*
284 * Install special quirks for timer, interrupt and memory setup:
285 * Fall back to generic behavior for traps:
286 * Override generic MP-table parsing:
287 */
288 x86_quirks = &visws_x86_quirks;
289
290 /*
291 * Install reboot quirks:
292 */
293 pm_power_off = visws_machine_power_off;
294 machine_ops.emergency_restart = visws_machine_emergency_restart;
295
296 /*
297 * Do not use broadcast IPIs:
298 */
299 no_broadcast = 0;
300
301#ifdef CONFIG_X86_IO_APIC
302 /*
303 * Turn off IO-APIC detection and initialization:
304 */
305 skip_ioapic_setup = 1;
306#endif
307
308 /*
309 * Get Board rev.
310 * First, we have to initialize the 307 part to allow us access
311 * to the GPIO registers. Let's map them at 0x0fc0 which is right
312 * after the PIIX4 PM section.
313 */
314 outb_p(SIO_DEV_SEL, SIO_INDEX);
315 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
316
317 outb_p(SIO_DEV_MSB, SIO_INDEX);
318 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
319
320 outb_p(SIO_DEV_LSB, SIO_INDEX);
321 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
322
323 outb_p(SIO_DEV_ENB, SIO_INDEX);
324 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
325
326 /*
327 * Now, we have to map the power management section to write
328 * a bit which enables access to the GPIO registers.
329 * What lunatic came up with this shit?
330 */
331 outb_p(SIO_DEV_SEL, SIO_INDEX);
332 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
333
334 outb_p(SIO_DEV_MSB, SIO_INDEX);
335 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
336
337 outb_p(SIO_DEV_LSB, SIO_INDEX);
338 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
339
340 outb_p(SIO_DEV_ENB, SIO_INDEX);
341 outb_p(1, SIO_DATA); /* Enable PM registers. */
342
343 /*
344 * Now, write the PM register which enables the GPIO registers.
345 */
346 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
347 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
348
349 /*
350 * Now, initialize the GPIO registers.
351 * We want them all to be inputs which is the
352 * power on default, so let's leave them alone.
353 * So, let's just read the board rev!
354 */
355 raw = inb_p(SIO_GP_DATA1);
356 raw &= 0x7f; /* 7 bits of valid board revision ID. */
357
358 if (visws_board_type == VISWS_320) {
359 if (raw < 0x6) {
360 visws_board_rev = 4;
361 } else if (raw < 0xc) {
362 visws_board_rev = 5;
363 } else {
364 visws_board_rev = 6;
365 }
366 } else if (visws_board_type == VISWS_540) {
367 visws_board_rev = 2;
368 } else {
369 visws_board_rev = raw;
370 }
371
372 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
373 (visws_board_type == VISWS_320 ? "320" :
374 (visws_board_type == VISWS_540 ? "540" :
375 "unknown")), visws_board_rev);
376}
377
378#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
379#define BCD (LI_INTB | LI_INTC | LI_INTD)
380#define ALLDEVS (A01234 | BCD)
381
382static __init void lithium_init(void)
383{
384 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
385 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
386
387 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
388 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
389 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
390/* panic("This machine is not SGI Visual Workstation 320/540"); */
391 }
392
393 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
394 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
395 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
396/* panic("This machine is not SGI Visual Workstation 320/540"); */
397 }
398
399 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
400 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
401}
402
403static __init void cobalt_init(void)
404{
405 /*
406 * On normal SMP PC this is used only with SMP, but we have to
407 * use it and set it up here to start the Cobalt clock
408 */
409 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
410 setup_local_APIC();
411 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
412 (unsigned int)apic_read(APIC_LVR),
413 (unsigned int)apic_read(APIC_ID));
414
415 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
416 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
417 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
418 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
419
420 /* Enable Cobalt APIC being careful to NOT change the ID! */
421 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
422
423 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
424 co_apic_read(CO_APIC_ID));
425}
426
427static int __init visws_trap_init(void)
428{
429 lithium_init();
430 cobalt_init();
431
432 return 1;
433}
434
435/*
436 * IRQ controller / APIC support:
437 */
438
439static DEFINE_SPINLOCK(cobalt_lock);
440
441/*
442 * Set the given Cobalt APIC Redirection Table entry to point
443 * to the given IDT vector/index.
444 */
445static inline void co_apic_set(int entry, int irq)
446{
447 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
448 co_apic_write(CO_APIC_HI(entry), 0);
449}
450
451/*
452 * Cobalt (IO)-APIC functions to handle PCI devices.
453 */
454static inline int co_apic_ide0_hack(void)
455{
456 extern char visws_board_type;
457 extern char visws_board_rev;
458
459 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
460 return 5;
461 return CO_APIC_IDE0;
462}
463
464static int is_co_apic(unsigned int irq)
465{
466 if (IS_CO_APIC(irq))
467 return CO_APIC(irq);
468
469 switch (irq) {
470 case 0: return CO_APIC_CPU;
471 case CO_IRQ_IDE0: return co_apic_ide0_hack();
472 case CO_IRQ_IDE1: return CO_APIC_IDE1;
473 default: return -1;
474 }
475}
476
477
478/*
479 * This is the SGI Cobalt (IO-)APIC:
480 */
481
482static void enable_cobalt_irq(unsigned int irq)
483{
484 co_apic_set(is_co_apic(irq), irq);
485}
486
487static void disable_cobalt_irq(unsigned int irq)
488{
489 int entry = is_co_apic(irq);
490
491 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
492 co_apic_read(CO_APIC_LO(entry));
493}
494
495/*
496 * "irq" really just serves to identify the device. Here is where we
497 * map this to the Cobalt APIC entry where it's physically wired.
498 * This is called via request_irq -> setup_irq -> irq_desc->startup()
499 */
500static unsigned int startup_cobalt_irq(unsigned int irq)
501{
502 unsigned long flags;
503
504 spin_lock_irqsave(&cobalt_lock, flags);
505 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
506 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
507 enable_cobalt_irq(irq);
508 spin_unlock_irqrestore(&cobalt_lock, flags);
509 return 0;
510}
511
512static void ack_cobalt_irq(unsigned int irq)
513{
514 unsigned long flags;
515
516 spin_lock_irqsave(&cobalt_lock, flags);
517 disable_cobalt_irq(irq);
518 apic_write(APIC_EOI, APIC_EIO_ACK);
519 spin_unlock_irqrestore(&cobalt_lock, flags);
520}
521
522static void end_cobalt_irq(unsigned int irq)
523{
524 unsigned long flags;
525
526 spin_lock_irqsave(&cobalt_lock, flags);
527 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
528 enable_cobalt_irq(irq);
529 spin_unlock_irqrestore(&cobalt_lock, flags);
530}
531
532static struct irq_chip cobalt_irq_type = {
533 .typename = "Cobalt-APIC",
534 .startup = startup_cobalt_irq,
535 .shutdown = disable_cobalt_irq,
536 .enable = enable_cobalt_irq,
537 .disable = disable_cobalt_irq,
538 .ack = ack_cobalt_irq,
539 .end = end_cobalt_irq,
540};
541
542
543/*
544 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
545 * -- not the manner expected by the code in i8259.c.
546 *
547 * there is a 'master' physical interrupt source that gets sent to
548 * the CPU. But in the chipset there are various 'virtual' interrupts
549 * waiting to be handled. We represent this to Linux through a 'master'
550 * interrupt controller type, and through a special virtual interrupt-
551 * controller. Device drivers only see the virtual interrupt sources.
552 */
553static unsigned int startup_piix4_master_irq(unsigned int irq)
554{
555 init_8259A(0);
556
557 return startup_cobalt_irq(irq);
558}
559
560static void end_piix4_master_irq(unsigned int irq)
561{
562 unsigned long flags;
563
564 spin_lock_irqsave(&cobalt_lock, flags);
565 enable_cobalt_irq(irq);
566 spin_unlock_irqrestore(&cobalt_lock, flags);
567}
568
569static struct irq_chip piix4_master_irq_type = {
570 .typename = "PIIX4-master",
571 .startup = startup_piix4_master_irq,
572 .ack = ack_cobalt_irq,
573 .end = end_piix4_master_irq,
574};
575
576
577static struct irq_chip piix4_virtual_irq_type = {
578 .typename = "PIIX4-virtual",
579 .shutdown = disable_8259A_irq,
580 .enable = enable_8259A_irq,
581 .disable = disable_8259A_irq,
582};
583
584
585/*
586 * PIIX4-8259 master/virtual functions to handle interrupt requests
587 * from legacy devices: floppy, parallel, serial, rtc.
588 *
589 * None of these get Cobalt APIC entries, neither do they have IDT
590 * entries. These interrupts are purely virtual and distributed from
591 * the 'master' interrupt source: CO_IRQ_8259.
592 *
593 * When the 8259 interrupts its handler figures out which of these
594 * devices is interrupting and dispatches to its handler.
595 *
596 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
597 * enable_irq gets the right irq. This 'master' irq is never directly
598 * manipulated by any driver.
599 */
600static irqreturn_t piix4_master_intr(int irq, void *dev_id)
601{
602 int realirq;
603 irq_desc_t *desc;
604 unsigned long flags;
605
606 spin_lock_irqsave(&i8259A_lock, flags);
607
608 /* Find out what's interrupting in the PIIX4 master 8259 */
609 outb(0x0c, 0x20); /* OCW3 Poll command */
610 realirq = inb(0x20);
611
612 /*
613 * Bit 7 == 0 means invalid/spurious
614 */
615 if (unlikely(!(realirq & 0x80)))
616 goto out_unlock;
617
618 realirq &= 7;
619
620 if (unlikely(realirq == 2)) {
621 outb(0x0c, 0xa0);
622 realirq = inb(0xa0);
623
624 if (unlikely(!(realirq & 0x80)))
625 goto out_unlock;
626
627 realirq = (realirq & 7) + 8;
628 }
629
630 /* mask and ack interrupt */
631 cached_irq_mask |= 1 << realirq;
632 if (unlikely(realirq > 7)) {
633 inb(0xa1);
634 outb(cached_slave_mask, 0xa1);
635 outb(0x60 + (realirq & 7), 0xa0);
636 outb(0x60 + 2, 0x20);
637 } else {
638 inb(0x21);
639 outb(cached_master_mask, 0x21);
640 outb(0x60 + realirq, 0x20);
641 }
642
643 spin_unlock_irqrestore(&i8259A_lock, flags);
644
645 desc = irq_desc + realirq;
646
647 /*
648 * handle this 'virtual interrupt' as a Cobalt one now.
649 */
650 kstat_cpu(smp_processor_id()).irqs[realirq]++;
651
652 if (likely(desc->action != NULL))
653 handle_IRQ_event(realirq, desc->action);
654
655 if (!(desc->status & IRQ_DISABLED))
656 enable_8259A_irq(realirq);
657
658 return IRQ_HANDLED;
659
660out_unlock:
661 spin_unlock_irqrestore(&i8259A_lock, flags);
662 return IRQ_NONE;
663}
664
665static struct irqaction master_action = {
666 .handler = piix4_master_intr,
667 .name = "PIIX4-8259",
668};
669
670static struct irqaction cascade_action = {
671 .handler = no_action,
672 .name = "cascade",
673};
674
675
676void init_VISWS_APIC_irqs(void)
677{
678 int i;
679
680 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
681 irq_desc[i].status = IRQ_DISABLED;
682 irq_desc[i].action = 0;
683 irq_desc[i].depth = 1;
684
685 if (i == 0) {
686 irq_desc[i].chip = &cobalt_irq_type;
687 }
688 else if (i == CO_IRQ_IDE0) {
689 irq_desc[i].chip = &cobalt_irq_type;
690 }
691 else if (i == CO_IRQ_IDE1) {
692 irq_desc[i].chip = &cobalt_irq_type;
693 }
694 else if (i == CO_IRQ_8259) {
695 irq_desc[i].chip = &piix4_master_irq_type;
696 }
697 else if (i < CO_IRQ_APIC0) {
698 irq_desc[i].chip = &piix4_virtual_irq_type;
699 }
700 else if (IS_CO_APIC(i)) {
701 irq_desc[i].chip = &cobalt_irq_type;
702 }
703 }
704
705 setup_irq(CO_IRQ_8259, &master_action);
706 setup_irq(2, &cascade_action);
707}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 956f38927aa7..0a1b1a9d922d 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -151,7 +151,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
151 insns, ip); 151 insns, ip);
152 case PARAVIRT_PATCH(pv_cpu_ops.iret): 152 case PARAVIRT_PATCH(pv_cpu_ops.iret):
153 return patch_internal(VMI_CALL_IRET, len, insns, ip); 153 return patch_internal(VMI_CALL_IRET, len, insns, ip);
154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): 154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); 155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
156 default: 156 default:
157 break; 157 break;
@@ -896,7 +896,7 @@ static inline int __init activate_vmi(void)
896 * the backend. They are performance critical anyway, so requiring 896 * the backend. They are performance critical anyway, so requiring
897 * a patch is not a big problem. 897 * a patch is not a big problem.
898 */ 898 */
899 pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; 899 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
900 pv_cpu_ops.iret = (void *)0xbadbab0; 900 pv_cpu_ops.iret = (void *)0xbadbab0;
901 901
902#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 906#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 907 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 908 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 909#endif
911 910
912 /* 911 /*
@@ -932,7 +931,7 @@ static inline int __init activate_vmi(void)
932 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; 931 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
933#endif 932#endif
934 pv_time_ops.sched_clock = vmi_sched_clock; 933 pv_time_ops.sched_clock = vmi_sched_clock;
935 pv_time_ops.get_cpu_khz = vmi_cpu_khz; 934 pv_time_ops.get_tsc_khz = vmi_tsc_khz;
936 935
937 /* We have true wallclock functions; disable CMOS clock sync */ 936 /* We have true wallclock functions; disable CMOS clock sync */
938 no_sync_cmos_clock = 1; 937 no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa9..6953859fe289 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
33#include <asm/apic.h> 33#include <asm/apic.h>
34#include <asm/timer.h> 34#include <asm/timer.h>
35#include <asm/i8253.h> 35#include <asm/i8253.h>
36 36#include <asm/irq_vectors.h>
37#include <irq_vectors.h>
38 37
39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 38#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 39#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -70,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
70 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); 69 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
71} 70}
72 71
73/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ 72/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
74unsigned long vmi_cpu_khz(void) 73unsigned long vmi_tsc_khz(void)
75{ 74{
76 unsigned long long khz; 75 unsigned long long khz;
77 khz = vmi_timer_ops.get_cycle_frequency(); 76 khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..cdb2363697d2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,23 +49,14 @@ SECTIONS
49 _etext = .; /* End of text section */ 49 _etext = .; /* End of text section */
50 } :text = 0x9090 50 } :text = 0x9090
51 51
52 NOTES :text :note
53
52 . = ALIGN(16); /* Exception table */ 54 . = ALIGN(16); /* Exception table */
53 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 55 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
54 __start___ex_table = .; 56 __start___ex_table = .;
55 *(__ex_table) 57 *(__ex_table)
56 __stop___ex_table = .; 58 __stop___ex_table = .;
57 } 59 } :text = 0x9090
58
59 NOTES :text :note
60
61 BUG_TABLE :text
62
63 . = ALIGN(4);
64 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
65 __tracedata_start = .;
66 *(.tracedata)
67 __tracedata_end = .;
68 }
69 60
70 RODATA 61 RODATA
71 62
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..63e5c1a22e88 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */ 22 note PT_NOTE FLAGS(0); /* ___ */
23} 23}
24SECTIONS 24SECTIONS
25{ 25{
@@ -40,26 +40,17 @@ SECTIONS
40 _etext = .; /* End of text section */ 40 _etext = .; /* End of text section */
41 } :text = 0x9090 41 } :text = 0x9090
42 42
43 NOTES :text :note
44
43 . = ALIGN(16); /* Exception table */ 45 . = ALIGN(16); /* Exception table */
44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { 46 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
45 __start___ex_table = .; 47 __start___ex_table = .;
46 *(__ex_table) 48 *(__ex_table)
47 __stop___ex_table = .; 49 __stop___ex_table = .;
48 } 50 } :text = 0x9090
49
50 NOTES :text :note
51
52 BUG_TABLE :text
53 51
54 RODATA 52 RODATA
55 53
56 . = ALIGN(4);
57 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
58 __tracedata_start = .;
59 *(.tracedata)
60 __tracedata_end = .;
61 }
62
63 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ 54 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
64 /* Data */ 55 /* Data */
65 .data : AT(ADDR(.data) - LOAD_OFFSET) { 56 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -177,6 +168,7 @@ SECTIONS
177 *(.con_initcall.init) 168 *(.con_initcall.init)
178 } 169 }
179 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16);
180 __x86cpuvendor_start = .; 172 __x86cpuvendor_start = .;
181 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
182 *(.x86cpuvendor.init) 174 *(.x86cpuvendor.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0a..0c029e8959c7 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18
19#include <asm/apic.h>
18#include <asm/pci-direct.h> 20#include <asm/pci-direct.h>
19#include <asm/io.h> 21#include <asm/io.h>
20#include <asm/paravirt.h> 22#include <asm/paravirt.h>
23#include <asm/setup.h>
21 24
22#if defined CONFIG_PCI && defined CONFIG_PARAVIRT 25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
23/* 26/*
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d564..0b8b6690a86d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
42#include <asm/topology.h> 42#include <asm/topology.h>
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) \
46 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
46#define __syscall_clobber "r11","cx","memory" 47#define __syscall_clobber "r11","cx","memory"
47 48
48/* 49/*
@@ -249,7 +250,7 @@ static ctl_table kernel_root_table2[] = {
249 doesn't violate that. We'll find out if it does. */ 250 doesn't violate that. We'll find out if it does. */
250static void __cpuinit vsyscall_set_cpu(int cpu) 251static void __cpuinit vsyscall_set_cpu(int cpu)
251{ 252{
252 unsigned long *d; 253 unsigned long d;
253 unsigned long node = 0; 254 unsigned long node = 0;
254#ifdef CONFIG_NUMA 255#ifdef CONFIG_NUMA
255 node = cpu_to_node(cpu); 256 node = cpu_to_node(cpu);
@@ -260,11 +261,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
260 /* Store cpu number in limit so that it can be loaded quickly 261 /* Store cpu number in limit so that it can be loaded quickly
261 in user space in vgetcpu. 262 in user space in vgetcpu.
262 12 bits for the CPU and 8 bits for the node. */ 263 12 bits for the CPU and 8 bits for the node. */
263 d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); 264 d = 0x0f40000000000ULL;
264 *d = 0x0f40000000000ULL; 265 d |= cpu;
265 *d |= cpu; 266 d |= (node & 0xf) << 12;
266 *d |= (node & 0xf) << 12; 267 d |= (node >> 4) << 48;
267 *d |= (node >> 4) << 48; 268 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
268} 269}
269 270
270static void __cpuinit cpu_vsyscall_init(void *arg) 271static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
278{ 279{
279 long cpu = (long)arg; 280 long cpu = (long)arg;
280 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 281 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
281 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); 282 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
282 return NOTIFY_DONE; 283 return NOTIFY_DONE;
283} 284}
284 285
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void)
301#ifdef CONFIG_SYSCTL 302#ifdef CONFIG_SYSCTL
302 register_sysctl_table(kernel_root_table2); 303 register_sysctl_table(kernel_root_table2);
303#endif 304#endif
304 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); 305 on_each_cpu(cpu_vsyscall_init, NULL, 1);
305 hotcpu_notifier(cpu_vsyscall_notifier, 0); 306 hotcpu_notifier(cpu_vsyscall_notifier, 0);
306 return 0; 307 return 0;
307} 308}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410fb..b545f371b5f5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -2,13 +2,20 @@
2 All C exports should go in the respective C files. */ 2 All C exports should go in the respective C files. */
3 3
4#include <linux/module.h> 4#include <linux/module.h>
5#include <net/checksum.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7 6
7#include <net/checksum.h>
8
8#include <asm/processor.h> 9#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/uaccess.h>
11#include <asm/desc.h> 12#include <asm/desc.h>
13#include <asm/ftrace.h>
14
15#ifdef CONFIG_FTRACE
16/* mcount is defined in assembly */
17EXPORT_SYMBOL(mcount);
18#endif
12 19
13EXPORT_SYMBOL(kernel_thread); 20EXPORT_SYMBOL(kernel_thread);
14 21
@@ -53,8 +60,3 @@ EXPORT_SYMBOL(init_level4_pgt);
53EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
54 61
55EXPORT_SYMBOL(_proxy_pda); 62EXPORT_SYMBOL(_proxy_pda);
56
57#ifdef CONFIG_PARAVIRT
58/* Virtualized guests may want to use it */
59EXPORT_SYMBOL_GPL(cpu_gdt_descr);
60#endif
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218db..d0e940bb6f40 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,8 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663f..c0f7872a9124 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,19 +193,16 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
208 }
209 } 206 }
210 207
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -308,6 +305,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
308 create_pit_timer(&ps->pit_timer, val, 0); 305 create_pit_timer(&ps->pit_timer, val, 0);
309 break; 306 break;
310 case 2: 307 case 2:
308 case 3:
311 create_pit_timer(&ps->pit_timer, val, 1); 309 create_pit_timer(&ps->pit_timer, val, 1);
312 break; 310 break;
313 default: 311 default:
@@ -459,7 +457,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 457 mutex_unlock(&pit_state->lock);
460} 458}
461 459
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 460static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
461 int len, int is_write)
463{ 462{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 463 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 464 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +499,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 499 mutex_unlock(&pit_state->lock);
501} 500}
502 501
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 502static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
503 int len, int is_write)
504{ 504{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 505 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 506}
@@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm)
575 } 575 }
576} 576}
577 577
578void __inject_pit_timer_intr(struct kvm *kvm) 578static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 579{
580 mutex_lock(&kvm->lock); 580 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def47..c31164e8aa46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 130{
131 struct kvm_pic *s = opaque; 131 struct kvm_pic *s = opaque;
132 132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 133 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 134 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
135 pic_update_irq(s);
136 }
135} 137}
136 138
137/* 139/*
@@ -346,7 +348,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 348 return s->elcr;
347} 349}
348 350
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 351static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
352 int len, int is_write)
350{ 353{
351 switch (addr) { 354 switch (addr) {
352 case 0x20: 355 case 0x20:
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c0..7ca47cbb48bb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae162..73f43de69f67 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 358 break;
359
359 case APIC_DM_NMI: 360 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 361 kvm_inject_nmi(vcpu);
361 break; 362 break;
362 363
363 case APIC_DM_INIT: 364 case APIC_DM_INIT:
@@ -572,6 +573,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 573{
573 u32 val = 0; 574 u32 val = 0;
574 575
576 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
577
575 if (offset >= LAPIC_MMIO_LENGTH) 578 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 579 return 0;
577 580
@@ -695,6 +698,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 698
696 offset &= 0xff0; 699 offset &= 0xff0;
697 700
701 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
702
698 switch (offset) { 703 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 704 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 705 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 785
781} 786}
782 787
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 788static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
789 int len, int size)
784{ 790{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 791 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 792 int ret = 0;
@@ -939,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 945 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 946 wait_queue_head_t *q = &apic->vcpu->wq;
941 947
942 atomic_inc(&apic->timer.pending); 948 if(!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 950 if (waitqueue_active(q)) {
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 952 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9cee..81858881287e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..b0e4ddca6c18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
72#ifndef MMU_DEBUG 73#ifndef MMU_DEBUG
@@ -776,6 +777,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 777 BUG();
777} 778}
778 779
780static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
781 struct kvm_mmu_page *sp)
782{
783 int i;
784
785 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
786 sp->spt[i] = shadow_trap_nonpresent_pte;
787}
788
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 789static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 790{
781 unsigned index; 791 unsigned index;
@@ -841,7 +851,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
841 hlist_add_head(&sp->hash_link, bucket); 851 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 852 if (!metaphysical)
843 rmap_write_protect(vcpu->kvm, gfn); 853 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 854 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
855 vcpu->arch.mmu.prefetch_page(vcpu, sp);
856 else
857 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 858 return sp;
846} 859}
847 860
@@ -917,14 +930,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
917 } 930 }
918 kvm_mmu_page_unlink_children(kvm, sp); 931 kvm_mmu_page_unlink_children(kvm, sp);
919 if (!sp->root_count) { 932 if (!sp->root_count) {
920 if (!sp->role.metaphysical) 933 if (!sp->role.metaphysical && !sp->role.invalid)
921 unaccount_shadowed(kvm, sp->gfn); 934 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 935 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 936 kvm_mmu_free_page(kvm, sp);
924 } else { 937 } else {
938 int invalid = sp->role.invalid;
925 list_move(&sp->link, &kvm->arch.active_mmu_pages); 939 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 940 sp->role.invalid = 1;
927 kvm_reload_remote_mmus(kvm); 941 kvm_reload_remote_mmus(kvm);
942 if (!sp->role.metaphysical && !invalid)
943 unaccount_shadowed(kvm, sp->gfn);
928 } 944 }
929 kvm_mmu_reset_last_pte_updated(kvm); 945 kvm_mmu_reset_last_pte_updated(kvm);
930} 946}
@@ -1103,7 +1119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1103 mark_page_dirty(vcpu->kvm, gfn); 1119 mark_page_dirty(vcpu->kvm, gfn);
1104 1120
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1121 pgprintk("%s: setting spte %llx\n", __func__, spte);
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", 1122 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", 1123 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); 1124 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1125 set_shadow_pte(shadow_pte, spte);
@@ -1122,8 +1138,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1138 else
1123 kvm_release_pfn_clean(pfn); 1139 kvm_release_pfn_clean(pfn);
1124 } 1140 }
1125 if (!ptwrite || !*ptwrite) 1141 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1142 vcpu->arch.last_pte_updated = shadow_pte;
1143 vcpu->arch.last_pte_gfn = gfn;
1144 }
1127} 1145}
1128 1146
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1147static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1171,9 +1189,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1171 return -ENOMEM; 1189 return -ENOMEM;
1172 } 1190 }
1173 1191
1174 table[index] = __pa(new_table->spt) 1192 set_shadow_pte(&table[index],
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1193 __pa(new_table->spt)
1176 | shadow_user_mask | shadow_x_mask; 1194 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1195 | shadow_user_mask | shadow_x_mask);
1177 } 1196 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK; 1197 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1198 }
@@ -1211,15 +1230,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1211} 1230}
1212 1231
1213 1232
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1215 struct kvm_mmu_page *sp)
1216{
1217 int i;
1218
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1220 sp->spt[i] = shadow_trap_nonpresent_pte;
1221}
1222
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1233static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1234{
1225 int i; 1235 int i;
@@ -1671,6 +1681,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 1681 vcpu->arch.update_pte.pfn = pfn;
1672} 1682}
1673 1683
1684static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1685{
1686 u64 *spte = vcpu->arch.last_pte_updated;
1687
1688 if (spte
1689 && vcpu->arch.last_pte_gfn == gfn
1690 && shadow_accessed_mask
1691 && !(*spte & shadow_accessed_mask)
1692 && is_shadow_present_pte(*spte))
1693 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1694}
1695
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1696void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 1697 const u8 *new, int bytes)
1676{ 1698{
@@ -1694,6 +1716,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 1716 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1717 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 1718 spin_lock(&vcpu->kvm->mmu_lock);
1719 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 1720 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 1721 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 1722 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1948,7 +1971,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1948 kvm_flush_remote_tlbs(kvm); 1971 kvm_flush_remote_tlbs(kvm);
1949} 1972}
1950 1973
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 1974static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 1975{
1953 struct kvm_mmu_page *page; 1976 struct kvm_mmu_page *page;
1954 1977
@@ -1968,6 +1991,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 1991 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 1992 int npages;
1970 1993
1994 if (!down_read_trylock(&kvm->slots_lock))
1995 continue;
1971 spin_lock(&kvm->mmu_lock); 1996 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 1997 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 1998 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2005,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2005 nr_to_scan--;
1981 2006
1982 spin_unlock(&kvm->mmu_lock); 2007 spin_unlock(&kvm->mmu_lock);
2008 up_read(&kvm->slots_lock);
1983 } 2009 }
1984 if (kvm_freed) 2010 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2011 list_move_tail(&kvm_freed->vm_list, &vm_list);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7a..258e5d56298e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b619396..4d918220baeb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -460,8 +460,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 461 struct kvm_mmu_page *sp)
462{ 462{
463 int i, offset = 0, r = 0; 463 int i, j, offset, r;
464 pt_element_t pt; 464 pt_element_t pt[256 / sizeof(pt_element_t)];
465 gpa_t pte_gpa;
465 466
466 if (sp->role.metaphysical 467 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 468 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,19 +470,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 470 return;
470 } 471 }
471 472
472 if (PTTYPE == 32) 473 pte_gpa = gfn_to_gpa(sp->gfn);
474 if (PTTYPE == 32) {
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 475 offset = sp->role.quadrant << PT64_LEVEL_BITS;
476 pte_gpa += offset * sizeof(pt_element_t);
477 }
474 478
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 479 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 480 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 481 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
478 482 for (j = 0; j < ARRAY_SIZE(pt); ++j)
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 483 if (r || is_present_pte(pt[j]))
480 sizeof(pt_element_t)); 484 sp->spt[i+j] = shadow_trap_nonpresent_pte;
481 if (r || is_present_pte(pt)) 485 else
482 sp->spt[i] = shadow_trap_nonpresent_pte; 486 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
483 else
484 sp->spt[i] = shadow_notrap_nonpresent_pte;
485 } 487 }
486} 488}
487 489
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab3..b756e876dce3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -27,6 +27,8 @@
27 27
28#include <asm/desc.h> 28#include <asm/desc.h>
29 29
30#define __ex(x) __kvm_handle_fault_on_reboot(x)
31
30MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32 34
@@ -129,17 +131,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 131
130static inline void clgi(void) 132static inline void clgi(void)
131{ 133{
132 asm volatile (SVM_CLGI); 134 asm volatile (__ex(SVM_CLGI));
133} 135}
134 136
135static inline void stgi(void) 137static inline void stgi(void)
136{ 138{
137 asm volatile (SVM_STGI); 139 asm volatile (__ex(SVM_STGI));
138} 140}
139 141
140static inline void invlpga(unsigned long addr, u32 asid) 142static inline void invlpga(unsigned long addr, u32 asid)
141{ 143{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 144 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 145}
144 146
145static inline unsigned long kvm_read_cr2(void) 147static inline unsigned long kvm_read_cr2(void)
@@ -270,19 +272,11 @@ static int has_svm(void)
270 272
271static void svm_hardware_disable(void *garbage) 273static void svm_hardware_disable(void *garbage)
272{ 274{
273 struct svm_cpu_data *svm_data 275 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 276
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 277 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 278 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 279 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 280}
287 281
288static void svm_hardware_enable(void *garbage) 282static void svm_hardware_enable(void *garbage)
@@ -321,6 +315,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 316}
323 317
318static void svm_cpu_uninit(int cpu)
319{
320 struct svm_cpu_data *svm_data
321 = per_cpu(svm_data, raw_smp_processor_id());
322
323 if (!svm_data)
324 return;
325
326 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
327 __free_page(svm_data->save_area);
328 kfree(svm_data);
329}
330
324static int svm_cpu_init(int cpu) 331static int svm_cpu_init(int cpu)
325{ 332{
326 struct svm_cpu_data *svm_data; 333 struct svm_cpu_data *svm_data;
@@ -458,6 +465,11 @@ err:
458 465
459static __exit void svm_hardware_unsetup(void) 466static __exit void svm_hardware_unsetup(void)
460{ 467{
468 int cpu;
469
470 for_each_online_cpu(cpu)
471 svm_cpu_uninit(cpu);
472
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 473 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 474 iopm_base = 0;
463} 475}
@@ -707,10 +719,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 719 rdtscll(vcpu->arch.host_tsc);
708} 720}
709 721
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu) 722static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{ 723{
716 struct vcpu_svm *svm = to_svm(vcpu); 724 struct vcpu_svm *svm = to_svm(vcpu);
@@ -949,7 +957,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 957
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 958static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 959{
952 return to_svm(vcpu)->db_regs[dr]; 960 unsigned long val = to_svm(vcpu)->db_regs[dr];
961 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
962 return val;
953} 963}
954 964
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 965static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -1004,6 +1014,16 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1004 1014
1005 fault_address = svm->vmcb->control.exit_info_2; 1015 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1016 error_code = svm->vmcb->control.exit_info_1;
1017
1018 if (!npt_enabled)
1019 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1020 (u32)fault_address, (u32)(fault_address >> 32),
1021 handler);
1022 else
1023 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1024 (u32)fault_address, (u32)(fault_address >> 32),
1025 handler);
1026
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1027 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1028}
1009 1029
@@ -1081,6 +1101,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1101 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1102}
1083 1103
1104static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1105{
1106 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1107 return 1;
1108}
1109
1110static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1111{
1112 ++svm->vcpu.stat.irq_exits;
1113 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1114 return 1;
1115}
1116
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1117static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1118{
1086 return 1; 1119 return 1;
@@ -1219,6 +1252,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1252 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1253 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1254 else {
1255 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1256 (u32)(data >> 32), handler);
1257
1222 svm->vmcb->save.rax = data & 0xffffffff; 1258 svm->vmcb->save.rax = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1259 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1260 svm->next_rip = svm->vmcb->save.rip + 2;
@@ -1284,16 +1320,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1320 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1321 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1322 case MSR_K7_EVNTSEL3:
1323 case MSR_K7_PERFCTR0:
1324 case MSR_K7_PERFCTR1:
1325 case MSR_K7_PERFCTR2:
1326 case MSR_K7_PERFCTR3:
1287 /* 1327 /*
1288 * only support writing 0 to the performance counters for now 1328 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1329 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1330 * happy
1291 */ 1331 */
1292 if (data != 0) 1332 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1333
1294 break; 1334 break;
1295 default: 1335 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1336 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1337 }
1299 return 0; 1338 return 0;
@@ -1304,6 +1343,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1343 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1344 u64 data = (svm->vmcb->save.rax & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1345 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1346
1347 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1348 handler);
1349
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1350 svm->next_rip = svm->vmcb->save.rip + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1351 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1352 kvm_inject_gp(&svm->vcpu, 0);
@@ -1323,6 +1366,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1366static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1367 struct kvm_run *kvm_run)
1325{ 1368{
1369 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1370
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1371 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1372 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1373 /*
@@ -1364,8 +1409,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1409 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1410 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1411 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1412 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1413 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1414 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1415 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1416 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1397,6 +1442,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1442 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1443 u32 exit_code = svm->vmcb->control.exit_code;
1399 1444
1445 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1446 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1447
1400 if (npt_enabled) { 1448 if (npt_enabled) {
1401 int mmu_reload = 0; 1449 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1450 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1518,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1518{
1471 struct vmcb_control_area *control; 1519 struct vmcb_control_area *control;
1472 1520
1521 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1522
1473 control = &svm->vmcb->control; 1523 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1524 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1525 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1660,9 +1710,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660 sync_lapic_to_cr8(vcpu); 1710 sync_lapic_to_cr8(vcpu);
1661 1711
1662 save_host_msrs(vcpu); 1712 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1713 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1714 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1715 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1716 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1717 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1718 svm->host_dr7 = read_dr7();
@@ -1716,17 +1766,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1716 /* Enter guest mode */ 1766 /* Enter guest mode */
1717 "push %%rax \n\t" 1767 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t" 1768 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t" 1769 __ex(SVM_VMLOAD) "\n\t"
1720 SVM_VMRUN "\n\t" 1770 __ex(SVM_VMRUN) "\n\t"
1721 SVM_VMSAVE "\n\t" 1771 __ex(SVM_VMSAVE) "\n\t"
1722 "pop %%rax \n\t" 1772 "pop %%rax \n\t"
1723#else 1773#else
1724 /* Enter guest mode */ 1774 /* Enter guest mode */
1725 "push %%eax \n\t" 1775 "push %%eax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1776 "mov %c[vmcb](%[svm]), %%eax \n\t"
1727 SVM_VMLOAD "\n\t" 1777 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1778 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1779 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1780 "pop %%eax \n\t"
1731#endif 1781#endif
1732 1782
@@ -1795,9 +1845,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1795 write_dr7(svm->host_dr7); 1845 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1846 kvm_write_cr2(svm->host_cr2);
1797 1847
1798 load_fs(fs_selector); 1848 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1849 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1850 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1851 load_host_msrs(vcpu);
1802 1852
1803 reload_tss(vcpu); 1853 reload_tss(vcpu);
@@ -1889,7 +1939,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1939 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1940 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1941 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1942
1894 .set_guest_debug = svm_guest_debug, 1943 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1944 .get_msr = svm_get_msr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 540e95179074..0cac63701719 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -30,6 +30,8 @@
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32 32
33#define __ex(x) __kvm_handle_fault_on_reboot(x)
34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 36MODULE_LICENSE("GPL");
35 37
@@ -53,6 +55,7 @@ struct vmcs {
53 55
54struct vcpu_vmx { 56struct vcpu_vmx {
55 struct kvm_vcpu vcpu; 57 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link;
56 int launched; 59 int launched;
57 u8 fail; 60 u8 fail;
58 u32 idt_vectoring_info; 61 u32 idt_vectoring_info;
@@ -88,9 +91,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
88} 91}
89 92
90static int init_rmode(struct kvm *kvm); 93static int init_rmode(struct kvm *kvm);
94static u64 construct_eptp(unsigned long root_hpa);
91 95
92static DEFINE_PER_CPU(struct vmcs *, vmxarea); 96static DEFINE_PER_CPU(struct vmcs *, vmxarea);
93static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 97static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
98static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
94 99
95static struct page *vmx_io_bitmap_a; 100static struct page *vmx_io_bitmap_a;
96static struct page *vmx_io_bitmap_b; 101static struct page *vmx_io_bitmap_b;
@@ -260,6 +265,11 @@ static inline int cpu_has_vmx_vpid(void)
260 SECONDARY_EXEC_ENABLE_VPID); 265 SECONDARY_EXEC_ENABLE_VPID);
261} 266}
262 267
268static inline int cpu_has_virtual_nmis(void)
269{
270 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
271}
272
263static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 273static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
264{ 274{
265 int i; 275 int i;
@@ -278,7 +288,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
278 u64 gva; 288 u64 gva;
279 } operand = { vpid, 0, gva }; 289 } operand = { vpid, 0, gva };
280 290
281 asm volatile (ASM_VMX_INVVPID 291 asm volatile (__ex(ASM_VMX_INVVPID)
282 /* CF==1 or ZF==1 --> rc = -1 */ 292 /* CF==1 or ZF==1 --> rc = -1 */
283 "; ja 1f ; ud2 ; 1:" 293 "; ja 1f ; ud2 ; 1:"
284 : : "a"(&operand), "c"(ext) : "cc", "memory"); 294 : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +300,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
290 u64 eptp, gpa; 300 u64 eptp, gpa;
291 } operand = {eptp, gpa}; 301 } operand = {eptp, gpa};
292 302
293 asm volatile (ASM_VMX_INVEPT 303 asm volatile (__ex(ASM_VMX_INVEPT)
294 /* CF==1 or ZF==1 --> rc = -1 */ 304 /* CF==1 or ZF==1 --> rc = -1 */
295 "; ja 1f ; ud2 ; 1:\n" 305 "; ja 1f ; ud2 ; 1:\n"
296 : : "a" (&operand), "c" (ext) : "cc", "memory"); 306 : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +321,7 @@ static void vmcs_clear(struct vmcs *vmcs)
311 u64 phys_addr = __pa(vmcs); 321 u64 phys_addr = __pa(vmcs);
312 u8 error; 322 u8 error;
313 323
314 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 324 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
315 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 325 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
316 : "cc", "memory"); 326 : "cc", "memory");
317 if (error) 327 if (error)
@@ -329,14 +339,16 @@ static void __vcpu_clear(void *arg)
329 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 339 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
330 per_cpu(current_vmcs, cpu) = NULL; 340 per_cpu(current_vmcs, cpu) = NULL;
331 rdtscll(vmx->vcpu.arch.host_tsc); 341 rdtscll(vmx->vcpu.arch.host_tsc);
342 list_del(&vmx->local_vcpus_link);
343 vmx->vcpu.cpu = -1;
344 vmx->launched = 0;
332} 345}
333 346
334static void vcpu_clear(struct vcpu_vmx *vmx) 347static void vcpu_clear(struct vcpu_vmx *vmx)
335{ 348{
336 if (vmx->vcpu.cpu == -1) 349 if (vmx->vcpu.cpu == -1)
337 return; 350 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); 351 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0;
340} 352}
341 353
342static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 354static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +390,7 @@ static unsigned long vmcs_readl(unsigned long field)
378{ 390{
379 unsigned long value; 391 unsigned long value;
380 392
381 asm volatile (ASM_VMX_VMREAD_RDX_RAX 393 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
382 : "=a"(value) : "d"(field) : "cc"); 394 : "=a"(value) : "d"(field) : "cc");
383 return value; 395 return value;
384} 396}
@@ -413,7 +425,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
413{ 425{
414 u8 error; 426 u8 error;
415 427
416 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 428 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
417 : "=q"(error) : "a"(value), "d"(field) : "cc"); 429 : "=q"(error) : "a"(value), "d"(field) : "cc");
418 if (unlikely(error)) 430 if (unlikely(error))
419 vmwrite_error(field, value); 431 vmwrite_error(field, value);
@@ -431,10 +443,8 @@ static void vmcs_write32(unsigned long field, u32 value)
431 443
432static void vmcs_write64(unsigned long field, u64 value) 444static void vmcs_write64(unsigned long field, u64 value)
433{ 445{
434#ifdef CONFIG_X86_64
435 vmcs_writel(field, value);
436#else
437 vmcs_writel(field, value); 446 vmcs_writel(field, value);
447#ifndef CONFIG_X86_64
438 asm volatile (""); 448 asm volatile ("");
439 vmcs_writel(field+1, value >> 32); 449 vmcs_writel(field+1, value >> 32);
440#endif 450#endif
@@ -474,7 +484,7 @@ static void reload_tss(void)
474 struct descriptor_table gdt; 484 struct descriptor_table gdt;
475 struct desc_struct *descs; 485 struct desc_struct *descs;
476 486
477 get_gdt(&gdt); 487 kvm_get_gdt(&gdt);
478 descs = (void *)gdt.base; 488 descs = (void *)gdt.base;
479 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 489 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
480 load_TR_desc(); 490 load_TR_desc();
@@ -530,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
530 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 540 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
531 * allow segment selectors with cpl > 0 or ti == 1. 541 * allow segment selectors with cpl > 0 or ti == 1.
532 */ 542 */
533 vmx->host_state.ldt_sel = read_ldt(); 543 vmx->host_state.ldt_sel = kvm_read_ldt();
534 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 544 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
535 vmx->host_state.fs_sel = read_fs(); 545 vmx->host_state.fs_sel = kvm_read_fs();
536 if (!(vmx->host_state.fs_sel & 7)) { 546 if (!(vmx->host_state.fs_sel & 7)) {
537 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 547 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
538 vmx->host_state.fs_reload_needed = 0; 548 vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
540 vmcs_write16(HOST_FS_SELECTOR, 0); 550 vmcs_write16(HOST_FS_SELECTOR, 0);
541 vmx->host_state.fs_reload_needed = 1; 551 vmx->host_state.fs_reload_needed = 1;
542 } 552 }
543 vmx->host_state.gs_sel = read_gs(); 553 vmx->host_state.gs_sel = kvm_read_gs();
544 if (!(vmx->host_state.gs_sel & 7)) 554 if (!(vmx->host_state.gs_sel & 7))
545 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 555 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
546 else { 556 else {
@@ -576,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
576 ++vmx->vcpu.stat.host_state_reload; 586 ++vmx->vcpu.stat.host_state_reload;
577 vmx->host_state.loaded = 0; 587 vmx->host_state.loaded = 0;
578 if (vmx->host_state.fs_reload_needed) 588 if (vmx->host_state.fs_reload_needed)
579 load_fs(vmx->host_state.fs_sel); 589 kvm_load_fs(vmx->host_state.fs_sel);
580 if (vmx->host_state.gs_ldt_reload_needed) { 590 if (vmx->host_state.gs_ldt_reload_needed) {
581 load_ldt(vmx->host_state.ldt_sel); 591 kvm_load_ldt(vmx->host_state.ldt_sel);
582 /* 592 /*
583 * If we have to reload gs, we must take care to 593 * If we have to reload gs, we must take care to
584 * preserve our gs base. 594 * preserve our gs base.
585 */ 595 */
586 local_irq_save(flags); 596 local_irq_save(flags);
587 load_gs(vmx->host_state.gs_sel); 597 kvm_load_gs(vmx->host_state.gs_sel);
588#ifdef CONFIG_X86_64 598#ifdef CONFIG_X86_64
589 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 599 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
590#endif 600#endif
@@ -617,13 +627,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
617 vcpu_clear(vmx); 627 vcpu_clear(vmx);
618 kvm_migrate_timers(vcpu); 628 kvm_migrate_timers(vcpu);
619 vpid_sync_vcpu_all(vmx); 629 vpid_sync_vcpu_all(vmx);
630 local_irq_disable();
631 list_add(&vmx->local_vcpus_link,
632 &per_cpu(vcpus_on_cpu, cpu));
633 local_irq_enable();
620 } 634 }
621 635
622 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 636 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
623 u8 error; 637 u8 error;
624 638
625 per_cpu(current_vmcs, cpu) = vmx->vmcs; 639 per_cpu(current_vmcs, cpu) = vmx->vmcs;
626 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 640 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
627 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 641 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
628 : "cc"); 642 : "cc");
629 if (error) 643 if (error)
@@ -640,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
640 * Linux uses per-cpu TSS and GDT, so set these when switching 654 * Linux uses per-cpu TSS and GDT, so set these when switching
641 * processors. 655 * processors.
642 */ 656 */
643 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 657 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
644 get_gdt(&dt); 658 kvm_get_gdt(&dt);
645 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 659 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
646 660
647 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 661 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +698,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
684 update_exception_bitmap(vcpu); 698 update_exception_bitmap(vcpu);
685} 699}
686 700
687static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
688{
689 vcpu_clear(to_vmx(vcpu));
690}
691
692static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 701static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
693{ 702{
694 return vmcs_readl(GUEST_RFLAGS); 703 return vmcs_readl(GUEST_RFLAGS);
@@ -913,6 +922,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
913 case MSR_IA32_TIME_STAMP_COUNTER: 922 case MSR_IA32_TIME_STAMP_COUNTER:
914 guest_write_tsc(data); 923 guest_write_tsc(data);
915 break; 924 break;
925 case MSR_P6_PERFCTR0:
926 case MSR_P6_PERFCTR1:
927 case MSR_P6_EVNTSEL0:
928 case MSR_P6_EVNTSEL1:
929 /*
930 * Just discard all writes to the performance counters; this
931 * should keep both older linux and windows 64-bit guests
932 * happy
933 */
934 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
935
936 break;
916 default: 937 default:
917 vmx_load_host_state(vmx); 938 vmx_load_host_state(vmx);
918 msr = find_msr_entry(vmx, msr_index); 939 msr = find_msr_entry(vmx, msr_index);
@@ -1022,6 +1043,7 @@ static void hardware_enable(void *garbage)
1022 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1043 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1023 u64 old; 1044 u64 old;
1024 1045
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1025 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1026 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
1027 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
@@ -1032,13 +1054,25 @@ static void hardware_enable(void *garbage)
1032 MSR_IA32_FEATURE_CONTROL_LOCKED | 1054 MSR_IA32_FEATURE_CONTROL_LOCKED |
1033 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
1034 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1035 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 1057 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr)
1036 : "memory", "cc"); 1059 : "memory", "cc");
1037} 1060}
1038 1061
1062static void vmclear_local_vcpus(void)
1063{
1064 int cpu = raw_smp_processor_id();
1065 struct vcpu_vmx *vmx, *n;
1066
1067 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1068 local_vcpus_link)
1069 __vcpu_clear(vmx);
1070}
1071
1039static void hardware_disable(void *garbage) 1072static void hardware_disable(void *garbage)
1040{ 1073{
1041 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1074 vmclear_local_vcpus();
1075 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1076 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1043} 1077}
1044 1078
@@ -1072,7 +1106,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1072 u32 _vmentry_control = 0; 1106 u32 _vmentry_control = 0;
1073 1107
1074 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 1108 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1075 opt = 0; 1109 opt = PIN_BASED_VIRTUAL_NMIS;
1076 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 1110 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1077 &_pin_based_exec_control) < 0) 1111 &_pin_based_exec_control) < 0)
1078 return -EIO; 1112 return -EIO;
@@ -1389,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1389static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1423static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1390{ 1424{
1391 vpid_sync_vcpu_all(to_vmx(vcpu)); 1425 vpid_sync_vcpu_all(to_vmx(vcpu));
1426 if (vm_need_ept())
1427 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1392} 1428}
1393 1429
1394static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1430static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1456,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1420 if (!(cr0 & X86_CR0_PG)) { 1456 if (!(cr0 & X86_CR0_PG)) {
1421 /* From paging/starting to nonpaging */ 1457 /* From paging/starting to nonpaging */
1422 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1458 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1423 vmcs_config.cpu_based_exec_ctrl | 1459 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1424 (CPU_BASED_CR3_LOAD_EXITING | 1460 (CPU_BASED_CR3_LOAD_EXITING |
1425 CPU_BASED_CR3_STORE_EXITING)); 1461 CPU_BASED_CR3_STORE_EXITING));
1426 vcpu->arch.cr0 = cr0; 1462 vcpu->arch.cr0 = cr0;
@@ -1430,7 +1466,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1430 } else if (!is_paging(vcpu)) { 1466 } else if (!is_paging(vcpu)) {
1431 /* From nonpaging to paging */ 1467 /* From nonpaging to paging */
1432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1468 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1433 vmcs_config.cpu_based_exec_ctrl & 1469 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1434 ~(CPU_BASED_CR3_LOAD_EXITING | 1470 ~(CPU_BASED_CR3_LOAD_EXITING |
1435 CPU_BASED_CR3_STORE_EXITING)); 1471 CPU_BASED_CR3_STORE_EXITING));
1436 vcpu->arch.cr0 = cr0; 1472 vcpu->arch.cr0 = cr0;
@@ -1821,7 +1857,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
1821 spin_unlock(&vmx_vpid_lock); 1857 spin_unlock(&vmx_vpid_lock);
1822} 1858}
1823 1859
1824void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 1860static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1825{ 1861{
1826 void *va; 1862 void *va;
1827 1863
@@ -1907,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1907 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 1943 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1908 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1944 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1909 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1945 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1910 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 1946 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
1911 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 1947 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
1912 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 1948 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1913#ifdef CONFIG_X86_64 1949#ifdef CONFIG_X86_64
1914 rdmsrl(MSR_FS_BASE, a); 1950 rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1922 1958
1923 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 1959 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1924 1960
1925 get_idt(&dt); 1961 kvm_get_idt(&dt);
1926 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1962 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1927 1963
1928 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1964 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -2114,6 +2150,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2114 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2150 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2115} 2151}
2116 2152
2153static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158}
2159
2117static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2118{ 2161{
2119 int word_index = __ffs(vcpu->arch.irq_summary); 2162 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2554,8 +2597,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2554 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2597 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2555 offset = exit_qualification & 0xffful; 2598 offset = exit_qualification & 0xffful;
2556 2599
2557 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2558
2559 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2600 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2560 2601
2561 if (er != EMULATE_DONE) { 2602 if (er != EMULATE_DONE) {
@@ -2639,6 +2680,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2639 return 1; 2680 return 1;
2640} 2681}
2641 2682
2683static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2684{
2685 u32 cpu_based_vm_exec_control;
2686
2687 /* clear pending NMI */
2688 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2689 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2690 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2691 ++vcpu->stat.nmi_window_exits;
2692
2693 return 1;
2694}
2695
2642/* 2696/*
2643 * The exit handlers return 1 if the exit was handled fully and guest execution 2697 * The exit handlers return 1 if the exit was handled fully and guest execution
2644 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2698 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2703,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2649 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 2703 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2650 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 2704 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2651 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 2705 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2706 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
2652 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 2707 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2653 [EXIT_REASON_CR_ACCESS] = handle_cr, 2708 [EXIT_REASON_CR_ACCESS] = handle_cr,
2654 [EXIT_REASON_DR_ACCESS] = handle_dr, 2709 [EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -2736,17 +2791,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2736 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2791 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2737} 2792}
2738 2793
2794static void enable_nmi_window(struct kvm_vcpu *vcpu)
2795{
2796 u32 cpu_based_vm_exec_control;
2797
2798 if (!cpu_has_virtual_nmis())
2799 return;
2800
2801 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2802 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2803 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2804}
2805
2806static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
2807{
2808 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2809 return !(guest_intr & (GUEST_INTR_STATE_NMI |
2810 GUEST_INTR_STATE_MOV_SS |
2811 GUEST_INTR_STATE_STI));
2812}
2813
2814static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
2815{
2816 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2817 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
2818 GUEST_INTR_STATE_STI)) &&
2819 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
2820}
2821
2822static void enable_intr_window(struct kvm_vcpu *vcpu)
2823{
2824 if (vcpu->arch.nmi_pending)
2825 enable_nmi_window(vcpu);
2826 else if (kvm_cpu_has_interrupt(vcpu))
2827 enable_irq_window(vcpu);
2828}
2829
2739static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2830static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2740{ 2831{
2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 2832 struct vcpu_vmx *vmx = to_vmx(vcpu);
2742 u32 idtv_info_field, intr_info_field; 2833 u32 idtv_info_field, intr_info_field, exit_intr_info_field;
2743 int has_ext_irq, interrupt_window_open;
2744 int vector; 2834 int vector;
2745 2835
2746 update_tpr_threshold(vcpu); 2836 update_tpr_threshold(vcpu);
2747 2837
2748 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2749 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2838 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2839 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
2750 idtv_info_field = vmx->idt_vectoring_info; 2840 idtv_info_field = vmx->idt_vectoring_info;
2751 if (intr_info_field & INTR_INFO_VALID_MASK) { 2841 if (intr_info_field & INTR_INFO_VALID_MASK) {
2752 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2842 if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2754,8 +2844,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2754 if (printk_ratelimit()) 2844 if (printk_ratelimit())
2755 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2845 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2756 } 2846 }
2757 if (has_ext_irq) 2847 enable_intr_window(vcpu);
2758 enable_irq_window(vcpu);
2759 return; 2848 return;
2760 } 2849 }
2761 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2850 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
@@ -2765,30 +2854,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2765 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; 2854 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2766 2855
2767 vmx_inject_irq(vcpu, vect); 2856 vmx_inject_irq(vcpu, vect);
2768 if (unlikely(has_ext_irq)) 2857 enable_intr_window(vcpu);
2769 enable_irq_window(vcpu);
2770 return; 2858 return;
2771 } 2859 }
2772 2860
2773 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); 2861 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2774 2862
2775 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2863 /*
2864 * SDM 3: 25.7.1.2
2865 * Clear bit "block by NMI" before VM entry if a NMI delivery
2866 * faulted.
2867 */
2868 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2869 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
2870 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2871 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2872 ~GUEST_INTR_STATE_NMI);
2873
2874 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
2875 & ~INTR_INFO_RESVD_BITS_MASK);
2776 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2876 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2777 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2778 2878
2779 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 2879 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
2780 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2880 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2781 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 2881 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2782 if (unlikely(has_ext_irq)) 2882 enable_intr_window(vcpu);
2783 enable_irq_window(vcpu);
2784 return; 2883 return;
2785 } 2884 }
2786 if (!has_ext_irq) 2885 if (cpu_has_virtual_nmis()) {
2886 /*
2887 * SDM 3: 25.7.1.2
2888 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2889 * a guest IRET fault.
2890 */
2891 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
2892 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
2893 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2894 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
2895 GUEST_INTR_STATE_NMI);
2896 else if (vcpu->arch.nmi_pending) {
2897 if (vmx_nmi_enabled(vcpu))
2898 vmx_inject_nmi(vcpu);
2899 enable_intr_window(vcpu);
2900 return;
2901 }
2902
2903 }
2904 if (!kvm_cpu_has_interrupt(vcpu))
2787 return; 2905 return;
2788 interrupt_window_open = 2906 if (vmx_irq_enabled(vcpu)) {
2789 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2790 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2791 if (interrupt_window_open) {
2792 vector = kvm_cpu_get_interrupt(vcpu); 2907 vector = kvm_cpu_get_interrupt(vcpu);
2793 vmx_inject_irq(vcpu, vector); 2908 vmx_inject_irq(vcpu, vector);
2794 kvm_timer_intr_post(vcpu, vector); 2909 kvm_timer_intr_post(vcpu, vector);
@@ -2838,7 +2953,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 "push %%edx; push %%ebp;" 2953 "push %%edx; push %%ebp;"
2839 "push %%ecx \n\t" 2954 "push %%ecx \n\t"
2840#endif 2955#endif
2841 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2956 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
2842 /* Check if vmlaunch of vmresume is needed */ 2957 /* Check if vmlaunch of vmresume is needed */
2843 "cmpl $0, %c[launched](%0) \n\t" 2958 "cmpl $0, %c[launched](%0) \n\t"
2844 /* Load guest registers. Don't clobber flags. */ 2959 /* Load guest registers. Don't clobber flags. */
@@ -2873,9 +2988,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2873#endif 2988#endif
2874 /* Enter guest mode */ 2989 /* Enter guest mode */
2875 "jne .Llaunched \n\t" 2990 "jne .Llaunched \n\t"
2876 ASM_VMX_VMLAUNCH "\n\t" 2991 __ex(ASM_VMX_VMLAUNCH) "\n\t"
2877 "jmp .Lkvm_vmx_return \n\t" 2992 "jmp .Lkvm_vmx_return \n\t"
2878 ".Llaunched: " ASM_VMX_VMRESUME "\n\t" 2993 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2879 ".Lkvm_vmx_return: " 2994 ".Lkvm_vmx_return: "
2880 /* Save guest registers, load host registers, keep flags */ 2995 /* Save guest registers, load host registers, keep flags */
2881#ifdef CONFIG_X86_64 2996#ifdef CONFIG_X86_64
@@ -2949,7 +3064,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2949 fixup_rmode_irq(vmx); 3064 fixup_rmode_irq(vmx);
2950 3065
2951 vcpu->arch.interrupt_window_open = 3066 vcpu->arch.interrupt_window_open =
2952 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 3067 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3068 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
2953 3069
2954 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3070 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2955 vmx->launched = 1; 3071 vmx->launched = 1;
@@ -2957,7 +3073,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2957 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3073 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2958 3074
2959 /* We need to handle NMIs before interrupts are enabled */ 3075 /* We need to handle NMIs before interrupts are enabled */
2960 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 3076 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
3077 (intr_info & INTR_INFO_VALID_MASK)) {
2961 KVMTRACE_0D(NMI, vcpu, handler); 3078 KVMTRACE_0D(NMI, vcpu, handler);
2962 asm("int $2"); 3079 asm("int $2");
2963 } 3080 }
@@ -2968,7 +3085,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3085 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 3086
2970 if (vmx->vmcs) { 3087 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 0, 1); 3088 vcpu_clear(vmx);
2972 free_vmcs(vmx->vmcs); 3089 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 3090 vmx->vmcs = NULL;
2974 } 3091 }
@@ -3095,7 +3212,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3095 .prepare_guest_switch = vmx_save_host_state, 3212 .prepare_guest_switch = vmx_save_host_state,
3096 .vcpu_load = vmx_vcpu_load, 3213 .vcpu_load = vmx_vcpu_load,
3097 .vcpu_put = vmx_vcpu_put, 3214 .vcpu_put = vmx_vcpu_put,
3098 .vcpu_decache = vmx_vcpu_decache,
3099 3215
3100 .set_guest_debug = set_guest_debug, 3216 .set_guest_debug = set_guest_debug,
3101 .guest_debug_pre = kvm_guest_debug_pre, 3217 .guest_debug_pre = kvm_guest_debug_pre,
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610dfe..425a13436b3f 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
41#define CPU_BASED_CR8_STORE_EXITING 0x00100000 41#define CPU_BASED_CR8_STORE_EXITING 0x00100000
42#define CPU_BASED_TPR_SHADOW 0x00200000 42#define CPU_BASED_TPR_SHADOW 0x00200000
43#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
43#define CPU_BASED_MOV_DR_EXITING 0x00800000 44#define CPU_BASED_MOV_DR_EXITING 0x00800000
44#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 45#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
45#define CPU_BASED_USE_IO_BITMAPS 0x02000000 46#define CPU_BASED_USE_IO_BITMAPS 0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
216#define EXIT_REASON_TRIPLE_FAULT 2 217#define EXIT_REASON_TRIPLE_FAULT 2
217 218
218#define EXIT_REASON_PENDING_INTERRUPT 7 219#define EXIT_REASON_PENDING_INTERRUPT 7
219 220#define EXIT_REASON_NMI_WINDOW 8
220#define EXIT_REASON_TASK_SWITCH 9 221#define EXIT_REASON_TASK_SWITCH 9
221#define EXIT_REASON_CPUID 10 222#define EXIT_REASON_CPUID 10
222#define EXIT_REASON_HLT 12 223#define EXIT_REASON_HLT 12
@@ -251,7 +252,9 @@ enum vmcs_field {
251#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 252#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
252#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 253#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
253#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ 254#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
255#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
254#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 256#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
257#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
255 258
256#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 259#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
257#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 260#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
@@ -259,9 +262,16 @@ enum vmcs_field {
259#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 262#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
260 263
261#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 264#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
265#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
262#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 266#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
263#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 267#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
264 268
269/* GUEST_INTERRUPTIBILITY_INFO flags. */
270#define GUEST_INTR_STATE_STI 0x00000001
271#define GUEST_INTR_STATE_MOV_SS 0x00000002
272#define GUEST_INTR_STATE_SMI 0x00000004
273#define GUEST_INTR_STATE_NMI 0x00000008
274
265/* 275/*
266 * Exit Qualifications for MOV for Control Register Access 276 * Exit Qualifications for MOV for Control Register Access
267 */ 277 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63a77caa59f1..9f1cdb011cff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
72 { "mmio_exits", VCPU_STAT(mmio_exits) }, 72 { "mmio_exits", VCPU_STAT(mmio_exits) },
73 { "signal_exits", VCPU_STAT(signal_exits) }, 73 { "signal_exits", VCPU_STAT(signal_exits) },
74 { "irq_window", VCPU_STAT(irq_window_exits) }, 74 { "irq_window", VCPU_STAT(irq_window_exits) },
75 { "nmi_window", VCPU_STAT(nmi_window_exits) },
75 { "halt_exits", VCPU_STAT(halt_exits) }, 76 { "halt_exits", VCPU_STAT(halt_exits) },
76 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 77 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) }, 78 { "hypercalls", VCPU_STAT(hypercalls) },
@@ -173,6 +174,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 174 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174} 175}
175 176
177void kvm_inject_nmi(struct kvm_vcpu *vcpu)
178{
179 vcpu->arch.nmi_pending = 1;
180}
181EXPORT_SYMBOL_GPL(kvm_inject_nmi);
182
176void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 183void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177{ 184{
178 WARN_ON(vcpu->arch.exception.pending); 185 WARN_ON(vcpu->arch.exception.pending);
@@ -604,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
604 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 611 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
605} 612}
606 613
614static bool msr_mtrr_valid(unsigned msr)
615{
616 switch (msr) {
617 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
618 case MSR_MTRRfix64K_00000:
619 case MSR_MTRRfix16K_80000:
620 case MSR_MTRRfix16K_A0000:
621 case MSR_MTRRfix4K_C0000:
622 case MSR_MTRRfix4K_C8000:
623 case MSR_MTRRfix4K_D0000:
624 case MSR_MTRRfix4K_D8000:
625 case MSR_MTRRfix4K_E0000:
626 case MSR_MTRRfix4K_E8000:
627 case MSR_MTRRfix4K_F0000:
628 case MSR_MTRRfix4K_F8000:
629 case MSR_MTRRdefType:
630 case MSR_IA32_CR_PAT:
631 return true;
632 case 0x2f8:
633 return true;
634 }
635 return false;
636}
637
638static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
639{
640 if (!msr_mtrr_valid(msr))
641 return 1;
642
643 vcpu->arch.mtrr[msr - 0x200] = data;
644 return 0;
645}
607 646
608int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 647int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
609{ 648{
@@ -625,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
625 break; 664 break;
626 case MSR_IA32_UCODE_REV: 665 case MSR_IA32_UCODE_REV:
627 case MSR_IA32_UCODE_WRITE: 666 case MSR_IA32_UCODE_WRITE:
628 case 0x200 ... 0x2ff: /* MTRRs */
629 break; 667 break;
668 case 0x200 ... 0x2ff:
669 return set_msr_mtrr(vcpu, msr, data);
630 case MSR_IA32_APICBASE: 670 case MSR_IA32_APICBASE:
631 kvm_set_apic_base(vcpu, data); 671 kvm_set_apic_base(vcpu, data);
632 break; 672 break;
@@ -684,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
684 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 724 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
685} 725}
686 726
727static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
728{
729 if (!msr_mtrr_valid(msr))
730 return 1;
731
732 *pdata = vcpu->arch.mtrr[msr - 0x200];
733 return 0;
734}
735
687int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 736int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
688{ 737{
689 u64 data; 738 u64 data;
@@ -705,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
705 case MSR_IA32_MC0_MISC+16: 754 case MSR_IA32_MC0_MISC+16:
706 case MSR_IA32_UCODE_REV: 755 case MSR_IA32_UCODE_REV:
707 case MSR_IA32_EBL_CR_POWERON: 756 case MSR_IA32_EBL_CR_POWERON:
708 /* MTRR registers */
709 case 0xfe:
710 case 0x200 ... 0x2ff:
711 data = 0; 757 data = 0;
712 break; 758 break;
759 case MSR_MTRRcap:
760 data = 0x500 | KVM_NR_VAR_MTRR;
761 break;
762 case 0x200 ... 0x2ff:
763 return get_msr_mtrr(vcpu, msr, pdata);
713 case 0xcd: /* fsb frequency */ 764 case 0xcd: /* fsb frequency */
714 data = 3; 765 data = 3;
715 break; 766 break;
@@ -817,41 +868,6 @@ out:
817 return r; 868 return r;
818} 869}
819 870
820/*
821 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
822 * cached on it.
823 */
824void decache_vcpus_on_cpu(int cpu)
825{
826 struct kvm *vm;
827 struct kvm_vcpu *vcpu;
828 int i;
829
830 spin_lock(&kvm_lock);
831 list_for_each_entry(vm, &vm_list, vm_list)
832 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
833 vcpu = vm->vcpus[i];
834 if (!vcpu)
835 continue;
836 /*
837 * If the vcpu is locked, then it is running on some
838 * other cpu and therefore it is not cached on the
839 * cpu in question.
840 *
841 * If it's not locked, check the last cpu it executed
842 * on.
843 */
844 if (mutex_trylock(&vcpu->mutex)) {
845 if (vcpu->cpu == cpu) {
846 kvm_x86_ops->vcpu_decache(vcpu);
847 vcpu->cpu = -1;
848 }
849 mutex_unlock(&vcpu->mutex);
850 }
851 }
852 spin_unlock(&kvm_lock);
853}
854
855int kvm_dev_ioctl_check_extension(long ext) 871int kvm_dev_ioctl_check_extension(long ext)
856{ 872{
857 int r; 873 int r;
@@ -869,6 +885,9 @@ int kvm_dev_ioctl_check_extension(long ext)
869 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
870 r = 1; 886 r = 1;
871 break; 887 break;
888 case KVM_CAP_COALESCED_MMIO:
889 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
890 break;
872 case KVM_CAP_VAPIC: 891 case KVM_CAP_VAPIC:
873 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 892 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
874 break; 893 break;
@@ -1781,13 +1800,14 @@ static void kvm_init_msr_list(void)
1781 * Only apic need an MMIO device hook, so shortcut now.. 1800 * Only apic need an MMIO device hook, so shortcut now..
1782 */ 1801 */
1783static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1802static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1784 gpa_t addr) 1803 gpa_t addr, int len,
1804 int is_write)
1785{ 1805{
1786 struct kvm_io_device *dev; 1806 struct kvm_io_device *dev;
1787 1807
1788 if (vcpu->arch.apic) { 1808 if (vcpu->arch.apic) {
1789 dev = &vcpu->arch.apic->dev; 1809 dev = &vcpu->arch.apic->dev;
1790 if (dev->in_range(dev, addr)) 1810 if (dev->in_range(dev, addr, len, is_write))
1791 return dev; 1811 return dev;
1792 } 1812 }
1793 return NULL; 1813 return NULL;
@@ -1795,13 +1815,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1795 1815
1796 1816
1797static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1817static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1798 gpa_t addr) 1818 gpa_t addr, int len,
1819 int is_write)
1799{ 1820{
1800 struct kvm_io_device *dev; 1821 struct kvm_io_device *dev;
1801 1822
1802 dev = vcpu_find_pervcpu_dev(vcpu, addr); 1823 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1803 if (dev == NULL) 1824 if (dev == NULL)
1804 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1825 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1826 is_write);
1805 return dev; 1827 return dev;
1806} 1828}
1807 1829
@@ -1869,7 +1891,7 @@ mmio:
1869 * Is this MMIO handled locally? 1891 * Is this MMIO handled locally?
1870 */ 1892 */
1871 mutex_lock(&vcpu->kvm->lock); 1893 mutex_lock(&vcpu->kvm->lock);
1872 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1894 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1873 if (mmio_dev) { 1895 if (mmio_dev) {
1874 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1896 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1875 mutex_unlock(&vcpu->kvm->lock); 1897 mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1946,7 @@ mmio:
1924 * Is this MMIO handled locally? 1946 * Is this MMIO handled locally?
1925 */ 1947 */
1926 mutex_lock(&vcpu->kvm->lock); 1948 mutex_lock(&vcpu->kvm->lock);
1927 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1949 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
1928 if (mmio_dev) { 1950 if (mmio_dev) {
1929 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1951 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1930 mutex_unlock(&vcpu->kvm->lock); 1952 mutex_unlock(&vcpu->kvm->lock);
@@ -2020,6 +2042,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2020 2042
2021int emulate_clts(struct kvm_vcpu *vcpu) 2043int emulate_clts(struct kvm_vcpu *vcpu)
2022{ 2044{
2045 KVMTRACE_0D(CLTS, vcpu, handler);
2023 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2046 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2024 return X86EMUL_CONTINUE; 2047 return X86EMUL_CONTINUE;
2025} 2048}
@@ -2053,21 +2076,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2053 2076
2054void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2077void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2055{ 2078{
2056 static int reported;
2057 u8 opcodes[4]; 2079 u8 opcodes[4];
2058 unsigned long rip = vcpu->arch.rip; 2080 unsigned long rip = vcpu->arch.rip;
2059 unsigned long rip_linear; 2081 unsigned long rip_linear;
2060 2082
2061 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2083 if (!printk_ratelimit())
2062
2063 if (reported)
2064 return; 2084 return;
2065 2085
2086 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2087
2066 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2088 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2067 2089
2068 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2090 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2069 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2091 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2070 reported = 1;
2071} 2092}
2072EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2093EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2073 2094
@@ -2105,27 +2126,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2105 ? X86EMUL_MODE_PROT64 : cs_db 2126 ? X86EMUL_MODE_PROT64 : cs_db
2106 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2127 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2107 2128
2108 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2109 vcpu->arch.emulate_ctxt.cs_base = 0;
2110 vcpu->arch.emulate_ctxt.ds_base = 0;
2111 vcpu->arch.emulate_ctxt.es_base = 0;
2112 vcpu->arch.emulate_ctxt.ss_base = 0;
2113 } else {
2114 vcpu->arch.emulate_ctxt.cs_base =
2115 get_segment_base(vcpu, VCPU_SREG_CS);
2116 vcpu->arch.emulate_ctxt.ds_base =
2117 get_segment_base(vcpu, VCPU_SREG_DS);
2118 vcpu->arch.emulate_ctxt.es_base =
2119 get_segment_base(vcpu, VCPU_SREG_ES);
2120 vcpu->arch.emulate_ctxt.ss_base =
2121 get_segment_base(vcpu, VCPU_SREG_SS);
2122 }
2123
2124 vcpu->arch.emulate_ctxt.gs_base =
2125 get_segment_base(vcpu, VCPU_SREG_GS);
2126 vcpu->arch.emulate_ctxt.fs_base =
2127 get_segment_base(vcpu, VCPU_SREG_FS);
2128
2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2130 2130
2131 /* Reject the instructions other than VMCALL/VMMCALL when 2131 /* Reject the instructions other than VMCALL/VMMCALL when
@@ -2300,9 +2300,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
2300} 2300}
2301 2301
2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2303 gpa_t addr) 2303 gpa_t addr, int len,
2304 int is_write)
2304{ 2305{
2305 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2306 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2306} 2307}
2307 2308
2308int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2309int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
@@ -2331,11 +2332,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2331 2332
2332 kvm_x86_ops->cache_regs(vcpu); 2333 kvm_x86_ops->cache_regs(vcpu);
2333 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2334 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2334 kvm_x86_ops->decache_regs(vcpu);
2335 2335
2336 kvm_x86_ops->skip_emulated_instruction(vcpu); 2336 kvm_x86_ops->skip_emulated_instruction(vcpu);
2337 2337
2338 pio_dev = vcpu_find_pio_dev(vcpu, port); 2338 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2339 if (pio_dev) { 2339 if (pio_dev) {
2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2341 complete_pio(vcpu); 2341 complete_pio(vcpu);
@@ -2417,7 +2417,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2417 } 2417 }
2418 } 2418 }
2419 2419
2420 pio_dev = vcpu_find_pio_dev(vcpu, port); 2420 pio_dev = vcpu_find_pio_dev(vcpu, port,
2421 vcpu->arch.pio.cur_count,
2422 !vcpu->arch.pio.in);
2421 if (!vcpu->arch.pio.in) { 2423 if (!vcpu->arch.pio.in) {
2422 /* string PIO write */ 2424 /* string PIO write */
2423 ret = pio_copy_data(vcpu); 2425 ret = pio_copy_data(vcpu);
@@ -2600,27 +2602,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2600 2602
2601unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2603unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2602{ 2604{
2605 unsigned long value;
2606
2603 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2607 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2604 switch (cr) { 2608 switch (cr) {
2605 case 0: 2609 case 0:
2606 return vcpu->arch.cr0; 2610 value = vcpu->arch.cr0;
2611 break;
2607 case 2: 2612 case 2:
2608 return vcpu->arch.cr2; 2613 value = vcpu->arch.cr2;
2614 break;
2609 case 3: 2615 case 3:
2610 return vcpu->arch.cr3; 2616 value = vcpu->arch.cr3;
2617 break;
2611 case 4: 2618 case 4:
2612 return vcpu->arch.cr4; 2619 value = vcpu->arch.cr4;
2620 break;
2613 case 8: 2621 case 8:
2614 return kvm_get_cr8(vcpu); 2622 value = kvm_get_cr8(vcpu);
2623 break;
2615 default: 2624 default:
2616 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2625 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2617 return 0; 2626 return 0;
2618 } 2627 }
2628 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2629 (u32)((u64)value >> 32), handler);
2630
2631 return value;
2619} 2632}
2620 2633
2621void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2634void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2622 unsigned long *rflags) 2635 unsigned long *rflags)
2623{ 2636{
2637 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2638 (u32)((u64)val >> 32), handler);
2639
2624 switch (cr) { 2640 switch (cr) {
2625 case 0: 2641 case 0:
2626 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2642 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2771,8 +2787,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2771 if (!apic || !apic->vapic_addr) 2787 if (!apic || !apic->vapic_addr)
2772 return; 2788 return;
2773 2789
2790 down_read(&vcpu->kvm->slots_lock);
2774 kvm_release_page_dirty(apic->vapic_page); 2791 kvm_release_page_dirty(apic->vapic_page);
2775 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2792 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2793 up_read(&vcpu->kvm->slots_lock);
2776} 2794}
2777 2795
2778static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2796static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2928,9 +2946,7 @@ out:
2928 2946
2929 post_kvm_run_save(vcpu, kvm_run); 2947 post_kvm_run_save(vcpu, kvm_run);
2930 2948
2931 down_read(&vcpu->kvm->slots_lock);
2932 vapic_exit(vcpu); 2949 vapic_exit(vcpu);
2933 up_read(&vcpu->kvm->slots_lock);
2934 2950
2935 return r; 2951 return r;
2936} 2952}
@@ -2942,15 +2958,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2942 2958
2943 vcpu_load(vcpu); 2959 vcpu_load(vcpu);
2944 2960
2961 if (vcpu->sigset_active)
2962 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2963
2945 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 2964 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2946 kvm_vcpu_block(vcpu); 2965 kvm_vcpu_block(vcpu);
2947 vcpu_put(vcpu); 2966 r = -EAGAIN;
2948 return -EAGAIN; 2967 goto out;
2949 } 2968 }
2950 2969
2951 if (vcpu->sigset_active)
2952 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2953
2954 /* re-sync apic's tpr */ 2970 /* re-sync apic's tpr */
2955 if (!irqchip_in_kernel(vcpu->kvm)) 2971 if (!irqchip_in_kernel(vcpu->kvm))
2956 kvm_set_cr8(vcpu, kvm_run->cr8); 2972 kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -3070,8 +3086,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3070 return 0; 3086 return 0;
3071} 3087}
3072 3088
3073static void get_segment(struct kvm_vcpu *vcpu, 3089void kvm_get_segment(struct kvm_vcpu *vcpu,
3074 struct kvm_segment *var, int seg) 3090 struct kvm_segment *var, int seg)
3075{ 3091{
3076 kvm_x86_ops->get_segment(vcpu, var, seg); 3092 kvm_x86_ops->get_segment(vcpu, var, seg);
3077} 3093}
@@ -3080,7 +3096,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{ 3096{
3081 struct kvm_segment cs; 3097 struct kvm_segment cs;
3082 3098
3083 get_segment(vcpu, &cs, VCPU_SREG_CS); 3099 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3084 *db = cs.db; 3100 *db = cs.db;
3085 *l = cs.l; 3101 *l = cs.l;
3086} 3102}
@@ -3094,15 +3110,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3094 3110
3095 vcpu_load(vcpu); 3111 vcpu_load(vcpu);
3096 3112
3097 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3113 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3098 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3114 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3099 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3115 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3100 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3116 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3101 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3117 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3102 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3118 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3103 3119
3104 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3120 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3105 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3121 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3106 3122
3107 kvm_x86_ops->get_idt(vcpu, &dt); 3123 kvm_x86_ops->get_idt(vcpu, &dt);
3108 sregs->idt.limit = dt.limit; 3124 sregs->idt.limit = dt.limit;
@@ -3154,7 +3170,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3154 return 0; 3170 return 0;
3155} 3171}
3156 3172
3157static void set_segment(struct kvm_vcpu *vcpu, 3173static void kvm_set_segment(struct kvm_vcpu *vcpu,
3158 struct kvm_segment *var, int seg) 3174 struct kvm_segment *var, int seg)
3159{ 3175{
3160 kvm_x86_ops->set_segment(vcpu, var, seg); 3176 kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3191,7 +3207,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3191 if (selector & 1 << 2) { 3207 if (selector & 1 << 2) {
3192 struct kvm_segment kvm_seg; 3208 struct kvm_segment kvm_seg;
3193 3209
3194 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3210 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3195 3211
3196 if (kvm_seg.unusable) 3212 if (kvm_seg.unusable)
3197 dtable->limit = 0; 3213 dtable->limit = 0;
@@ -3297,7 +3313,7 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3297{ 3313{
3298 struct kvm_segment kvm_seg; 3314 struct kvm_segment kvm_seg;
3299 3315
3300 get_segment(vcpu, &kvm_seg, seg); 3316 kvm_get_segment(vcpu, &kvm_seg, seg);
3301 return kvm_seg.selector; 3317 return kvm_seg.selector;
3302} 3318}
3303 3319
@@ -3313,8 +3329,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3313 return 0; 3329 return 0;
3314} 3330}
3315 3331
3316static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3332int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3317 int type_bits, int seg) 3333 int type_bits, int seg)
3318{ 3334{
3319 struct kvm_segment kvm_seg; 3335 struct kvm_segment kvm_seg;
3320 3336
@@ -3327,7 +3343,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3327 if (!kvm_seg.s) 3343 if (!kvm_seg.s)
3328 kvm_seg.unusable = 1; 3344 kvm_seg.unusable = 1;
3329 3345
3330 set_segment(vcpu, &kvm_seg, seg); 3346 kvm_set_segment(vcpu, &kvm_seg, seg);
3331 return 0; 3347 return 0;
3332} 3348}
3333 3349
@@ -3373,25 +3389,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3389 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3390 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3375 3391
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3392 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3393 return 1;
3378 3394
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3395 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3396 return 1;
3381 3397
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3398 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3399 return 1;
3384 3400
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3401 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3402 return 1;
3387 3403
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3404 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3405 return 1;
3390 3406
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3407 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3408 return 1;
3393 3409
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3410 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3411 return 1;
3396 return 0; 3412 return 0;
3397} 3413}
@@ -3432,24 +3448,24 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3448 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3449 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3434 3450
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3451 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3452 return 1;
3437 3453
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3454 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3455 return 1;
3440 3456
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3457 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3458 return 1;
3443 3459
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3460 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3461 return 1;
3446 3462
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3463 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3464 return 1;
3449 return 0; 3465 return 0;
3450} 3466}
3451 3467
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3468static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3469 struct desc_struct *cseg_desc,
3454 struct desc_struct *nseg_desc) 3470 struct desc_struct *nseg_desc)
3455{ 3471{
@@ -3472,7 +3488,7 @@ out:
3472 return ret; 3488 return ret;
3473} 3489}
3474 3490
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3491static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3492 struct desc_struct *cseg_desc,
3477 struct desc_struct *nseg_desc) 3493 struct desc_struct *nseg_desc)
3478{ 3494{
@@ -3502,7 +3518,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3502 struct desc_struct nseg_desc; 3518 struct desc_struct nseg_desc;
3503 int ret = 0; 3519 int ret = 0;
3504 3520
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3521 kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3506 3522
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3523 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3524 goto out;
@@ -3561,7 +3577,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3577 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3578 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3579 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3580 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3581out:
3566 kvm_x86_ops->decache_regs(vcpu); 3582 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3583 return ret;
@@ -3628,15 +3644,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3628 } 3644 }
3629 } 3645 }
3630 3646
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3647 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3648 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3649 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3650 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3651 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3652 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3637 3653
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3654 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3655 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3640 3656
3641 vcpu_put(vcpu); 3657 vcpu_put(vcpu);
3642 3658
@@ -3751,14 +3767,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3767 * allocate ram with GFP_KERNEL.
3752 */ 3768 */
3753 if (!used_math()) 3769 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3770 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3771
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3772 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3773 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3774 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3775 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3776 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3777 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3778 preempt_enable();
3763 3779
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3780 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3791,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3791 return;
3776 3792
3777 vcpu->guest_fpu_loaded = 1; 3793 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3794 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3795 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3796}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3797EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3798
@@ -3786,8 +3802,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3802 return;
3787 3803
3788 vcpu->guest_fpu_loaded = 0; 3804 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3805 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3806 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3807 ++vcpu->stat.fpu_reload;
3792} 3808}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3809EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -4016,6 +4032,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4032 return 0;
4017} 4033}
4018 4034
4035void kvm_arch_flush_shadow(struct kvm *kvm)
4036{
4037 kvm_mmu_zap_all(kvm);
4038}
4039
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4040int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4041{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4042 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
@@ -4044,6 +4065,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4044 * So need not to call smp_call_function_single() in that case. 4065 * So need not to call smp_call_function_single() in that case.
4045 */ 4066 */
4046 if (vcpu->guest_mode && vcpu->cpu != cpu) 4067 if (vcpu->guest_mode && vcpu->cpu != cpu)
4047 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 4068 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4048 put_cpu(); 4069 put_cpu();
4049} 4070}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890c..f2f90468f8b1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -121,7 +121,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 122 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 123 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 124 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 127 /* 0x70 - 0x77 */
@@ -138,9 +138,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 141 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 142 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 143 /* 0x90 - 0x97 */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 146 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 147 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 148 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 155 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 156 /* 0xB0 - 0xBF */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xC0 - 0xC7 */ 159 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 161 0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,8 @@ static u16 opcode_table[256] = {
168 /* 0xE0 - 0xE7 */ 171 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */ 173 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 174 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps,
172 0, 0, 0, 0, 176 0, 0, 0, 0,
173 /* 0xF0 - 0xF7 */ 177 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 178 0, 0, 0, 0,
@@ -215,7 +219,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 219 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 220 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 221 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 222 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 223 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 224 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 225 DstMem | SrcReg | ModRM | BitOp,
@@ -518,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 522 register_address_increment(c, &c->eip, rel);
519} 523}
520 524
525static void set_seg_override(struct decode_cache *c, int seg)
526{
527 c->has_seg_override = true;
528 c->seg_override = seg;
529}
530
531static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
532{
533 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
534 return 0;
535
536 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
537}
538
539static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
540 struct decode_cache *c)
541{
542 if (!c->has_seg_override)
543 return 0;
544
545 return seg_base(ctxt, c->seg_override);
546}
547
548static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
549{
550 return seg_base(ctxt, VCPU_SREG_ES);
551}
552
553static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
554{
555 return seg_base(ctxt, VCPU_SREG_SS);
556}
557
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 558static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 559 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 560 unsigned long linear, u8 *dest)
@@ -660,7 +697,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 697{
661 struct decode_cache *c = &ctxt->decode; 698 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 699 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 700 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 701 int rc = 0;
665 702
666 if (c->rex_prefix) { 703 if (c->rex_prefix) {
@@ -731,47 +768,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 768 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 769 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 770 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 771 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 772 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 773 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 774 } else {
738 /* 32/64-bit ModR/M decode. */ 775 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 776 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 777 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 778 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 779 base_reg |= sib & 7;
745 scale = sib >> 6; 780 scale = sib >> 6;
746 781
747 switch (base_reg) { 782 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 783 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 784 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 785 c->modrm_ea += c->regs[base_reg];
757 } 786 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 787 c->modrm_ea += c->regs[index_reg] << scale;
763 } 788 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 789 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 790 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 791 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 792 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 793 switch (c->modrm_mod) {
776 case 0: 794 case 0:
777 if (c->modrm_rm == 5) 795 if (c->modrm_rm == 5)
@@ -785,22 +803,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 803 break;
786 } 804 }
787 } 805 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 806done:
805 return rc; 807 return rc;
806} 808}
@@ -838,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
838 840
839 memset(c, 0, sizeof(struct decode_cache)); 841 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 842 c->eip = ctxt->vcpu->arch.rip;
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 845
843 switch (mode) { 846 switch (mode) {
@@ -876,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 879 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 880 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 881 break;
882 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 883 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 884 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 885 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 886 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 887 break;
888 case 0x64: /* FS override */ 888 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 889 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 890 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 891 break;
897 case 0x40 ... 0x4f: /* REX */ 892 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 893 if (mode != X86EMUL_MODE_PROT64)
@@ -964,15 +959,11 @@ done_prefixes:
964 if (rc) 959 if (rc)
965 goto done; 960 goto done;
966 961
967 if (!c->override_base) 962 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 963 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 964
974 if (c->override_base) 965 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 966 c->modrm_ea += seg_override_base(ctxt, c);
976 967
977 if (c->ad_bytes != 8) 968 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 969 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1040,7 @@ done_prefixes:
1049 break; 1040 break;
1050 case DstMem: 1041 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1042 if ((c->d & ModRM) && c->modrm_mod == 3) {
1043 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1044 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1045 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1046 c->dst.ptr = c->modrm_ptr;
@@ -1058,6 +1050,9 @@ done_prefixes:
1058 break; 1050 break;
1059 } 1051 }
1060 1052
1053 if (c->rip_relative)
1054 c->modrm_ea += c->eip;
1055
1061done: 1056done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1057 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1058}
@@ -1070,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1065 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1066 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1067 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1068 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1069 c->regs[VCPU_REGS_RSP]);
1075} 1070}
1076 1071
@@ -1080,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1075 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1076 int rc;
1082 1077
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1078 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1079 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1080 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1081 if (rc != 0)
@@ -1402,11 +1397,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1398 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1399 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1400 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1401 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1402 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1403 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1404 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1405 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1406 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1407 goto done;
@@ -1420,9 +1415,8 @@ special_insn:
1420 goto cannot_emulate; 1415 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1416 c->dst.val = (s32) c->src.val;
1422 break; 1417 break;
1418 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1419 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1420 emulate_push(ctxt);
1427 break; 1421 break;
1428 case 0x6c: /* insb */ 1422 case 0x6c: /* insb */
@@ -1433,7 +1427,7 @@ special_insn:
1433 c->rep_prefix ? 1427 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1428 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1429 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1430 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1431 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1432 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1433 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1443,8 @@ special_insn:
1449 c->rep_prefix ? 1443 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1444 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1445 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1446 register_address(c,
1453 *c->override_base : 1447 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1448 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1449 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1450 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1483,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1483 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1484 break;
1492 case 0x86 ... 0x87: /* xchg */ 1485 case 0x86 ... 0x87: /* xchg */
1486 xchg:
1493 /* Write back the register source. */ 1487 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1488 switch (c->dst.bytes) {
1495 case 1: 1489 case 1:
@@ -1514,14 +1508,60 @@ special_insn:
1514 break; 1508 break;
1515 case 0x88 ... 0x8b: /* mov */ 1509 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1510 goto mov;
1511 case 0x8c: { /* mov r/m, sreg */
1512 struct kvm_segment segreg;
1513
1514 if (c->modrm_reg <= 5)
1515 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1516 else {
1517 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1518 c->modrm);
1519 goto cannot_emulate;
1520 }
1521 c->dst.val = segreg.selector;
1522 break;
1523 }
1517 case 0x8d: /* lea r16/r32, m */ 1524 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1525 c->dst.val = c->modrm_ea;
1519 break; 1526 break;
1527 case 0x8e: { /* mov seg, r/m16 */
1528 uint16_t sel;
1529 int type_bits;
1530 int err;
1531
1532 sel = c->src.val;
1533 if (c->modrm_reg <= 5) {
1534 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1535 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1536 type_bits, c->modrm_reg);
1537 } else {
1538 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1539 c->modrm);
1540 goto cannot_emulate;
1541 }
1542
1543 if (err < 0)
1544 goto cannot_emulate;
1545
1546 c->dst.type = OP_NONE; /* Disable writeback. */
1547 break;
1548 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1549 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1550 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1551 if (rc != 0)
1523 goto done; 1552 goto done;
1524 break; 1553 break;
1554 case 0x90: /* nop / xchg r8,rax */
1555 if (!(c->rex_prefix & 1)) { /* nop */
1556 c->dst.type = OP_NONE;
1557 break;
1558 }
1559 case 0x91 ... 0x97: /* xchg reg,rax */
1560 c->src.type = c->dst.type = OP_REG;
1561 c->src.bytes = c->dst.bytes = c->op_bytes;
1562 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1563 c->src.val = *(c->src.ptr);
1564 goto xchg;
1525 case 0x9c: /* pushf */ 1565 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1566 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1567 emulate_push(ctxt);
@@ -1540,11 +1580,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1580 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1581 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1582 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1583 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1584 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1585 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1586 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1587 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1588 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1589 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1599,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1599 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1601 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1602 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1603 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1604 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1605 &c->src.val,
@@ -1572,7 +1610,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1610 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1611 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1612 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1613 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1614 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1615 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1616 &c->dst.val,
@@ -1596,7 +1634,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1634 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1637 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1639 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1640 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1646,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1646 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1647 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1648 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1649 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1650 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1651 &c->dst.val,
1615 c->dst.bytes, 1652 c->dst.bytes,
@@ -1622,6 +1659,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1659 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1660 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1661 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */
1663 goto mov;
1625 case 0xc0 ... 0xc1: 1664 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1665 emulate_grp2(ctxt);
1627 break; 1666 break;
@@ -1660,13 +1699,39 @@ special_insn:
1660 break; 1699 break;
1661 } 1700 }
1662 case 0xe9: /* jmp rel */ 1701 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1702 goto jmp;
1703 case 0xea: /* jmp far */ {
1704 uint32_t eip;
1705 uint16_t sel;
1706
1707 switch (c->op_bytes) {
1708 case 2:
1709 eip = insn_fetch(u16, 2, c->eip);
1710 break;
1711 case 4:
1712 eip = insn_fetch(u32, 4, c->eip);
1713 break;
1714 default:
1715 DPRINTF("jmp far: Invalid op_bytes\n");
1716 goto cannot_emulate;
1717 }
1718 sel = insn_fetch(u16, 2, c->eip);
1719 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1720 DPRINTF("jmp far: Failed to load CS descriptor\n");
1721 goto cannot_emulate;
1722 }
1723
1724 c->eip = eip;
1725 break;
1726 }
1727 case 0xeb:
1728 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1729 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1730 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1731 break;
1667 case 0xf4: /* hlt */ 1732 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1733 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1734 break;
1670 case 0xf5: /* cmc */ 1735 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1736 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1737 ctxt->eflags ^= EFLG_CF;
@@ -1882,6 +1947,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 1947 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 1948 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 1949 break;
1950 case 0xae: /* clflush */
1951 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 1952 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 1953 /*
1887 * Save real source value, then compare EAX against 1954 * Save real source value, then compare EAX against
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 964dfa36d367..c70e12b1a637 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -3,7 +3,7 @@ config LGUEST_GUEST
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE 5 depends on !X86_PAE
6 depends on !(X86_VISWS || X86_VOYAGER) 6 depends on !X86_VOYAGER
7 select VIRTIO 7 select VIRTIO
8 select VIRTIO_RING 8 select VIRTIO_RING
9 select VIRTIO_CONSOLE 9 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5c7e2fd52075..0313a5eec412 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -607,7 +607,7 @@ static unsigned long lguest_get_wallclock(void)
607 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 607 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
608 * This matches what we want here: if we return 0 from this function, the x86 608 * This matches what we want here: if we return 0 from this function, the x86
609 * TSC clock will give up and not register itself. */ 609 * TSC clock will give up and not register itself. */
610static unsigned long lguest_cpu_khz(void) 610static unsigned long lguest_tsc_khz(void)
611{ 611{
612 return lguest_data.tsc_khz; 612 return lguest_data.tsc_khz;
613} 613}
@@ -835,7 +835,7 @@ static __init char *lguest_memory_setup(void)
835 835
836 /* The Linux bootloader header contains an "e820" memory map: the 836 /* The Linux bootloader header contains an "e820" memory map: the
837 * Launcher populated the first entry with our memory limit. */ 837 * Launcher populated the first entry with our memory limit. */
838 add_memory_region(boot_params.e820_map[0].addr, 838 e820_add_region(boot_params.e820_map[0].addr,
839 boot_params.e820_map[0].size, 839 boot_params.e820_map[0].size,
840 boot_params.e820_map[0].type); 840 boot_params.e820_map[0].type);
841 841
@@ -991,14 +991,13 @@ __init void lguest_init(void)
991#ifdef CONFIG_X86_LOCAL_APIC 991#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 992 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 993 pv_apic_ops.apic_write = lguest_apic_write;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read; 994 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 995#endif
997 996
998 /* time operations */ 997 /* time operations */
999 pv_time_ops.get_wallclock = lguest_get_wallclock; 998 pv_time_ops.get_wallclock = lguest_get_wallclock;
1000 pv_time_ops.time_init = lguest_time_init; 999 pv_time_ops.time_init = lguest_time_init;
1001 pv_time_ops.get_cpu_khz = lguest_cpu_khz; 1000 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1002 1001
1003 /* Now is a good time to look at the implementations of these functions 1002 /* Now is a good time to look at the implementations of these functions
1004 * before returning to the rest of lguest_init(). */ 1003 * before returning to the rest of lguest_init(). */
@@ -1012,6 +1011,7 @@ __init void lguest_init(void)
1012 * clobbered. The Launcher places our initial pagetables somewhere at 1011 * clobbered. The Launcher places our initial pagetables somewhere at
1013 * the top of our physical memory, so we don't need extra space: set 1012 * the top of our physical memory, so we don't need extra space: set
1014 * init_pg_tables_end to the end of the kernel. */ 1013 * init_pg_tables_end to the end of the kernel. */
1014 init_pg_tables_start = __pa(pg0);
1015 init_pg_tables_end = __pa(pg0); 1015 init_pg_tables_end = __pa(pg0);
1016 1016
1017 /* Load the %fs segment register (the per-cpu segment register) with 1017 /* Load the %fs segment register (the per-cpu segment register) with
@@ -1065,9 +1065,9 @@ __init void lguest_init(void)
1065 pm_power_off = lguest_power_off; 1065 pm_power_off = lguest_power_off;
1066 machine_ops.restart = lguest_restart; 1066 machine_ops.restart = lguest_restart;
1067 1067
1068 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1068 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed
1069 * to boot as normal. It never returns. */ 1069 * to boot as normal. It never returns. */
1070 start_kernel(); 1070 i386_start_kernel();
1071} 1071}
1072/* 1072/*
1073 * This marks the end of stage II of our journey, The Guest. 1073 * This marks the end of stage II of our journey, The Guest.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..aa3fa4119424 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -4,8 +4,9 @@
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr-on-cpu.o
6 6
7lib-y := delay_$(BITS).o 7lib-y := delay.o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o 8lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o
9lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
10 11
11ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ee1c3f635157..dfdf428975c0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
5 */ 7 */
6 8
7#include <linux/linkage.h> 9#include <linux/linkage.h>
8#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
20 .long \orig-1f /* by default jump to orig */ 22 .long \orig-1f /* by default jump to orig */
211: 231:
22 .section .altinstr_replacement,"ax" 24 .section .altinstr_replacement,"ax"
232: .byte 0xe9 /* near jump with 32bit immediate */ 252: .byte 0xe9 /* near jump with 32bit immediate */
24 .long \alt-1b /* offset */ /* or alternatively to alt */ 26 .long \alt-1b /* offset */ /* or alternatively to alt */
25 .previous 27 .previous
26 .section .altinstructions,"a" 28 .section .altinstructions,"a"
27 .align 8 29 .align 8
28 .quad 0b 30 .quad 0b
29 .quad 2b 31 .quad 2b
30 .byte \feature /* when feature is set */ 32 .byte \feature /* when feature is set */
31 .byte 5 33 .byte 5
32 .byte 5 34 .byte 5
33 .previous 35 .previous
34 .endm 36 .endm
35 37
36/* Standard copy_to_user with segment limit checking */ 38 .macro ALIGN_DESTINATION
39#ifdef FIX_ALIGNMENT
40 /* check for bad alignment of destination */
41 movl %edi,%ecx
42 andl $7,%ecx
43 jz 102f /* already aligned */
44 subl $8,%ecx
45 negl %ecx
46 subl %ecx,%edx
47100: movb (%rsi),%al
48101: movb %al,(%rdi)
49 incq %rsi
50 incq %rdi
51 decl %ecx
52 jnz 100b
53102:
54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail
57 .previous
58
59 .section __ex_table,"a"
60 .align 8
61 .quad 100b,103b
62 .quad 101b,103b
63 .previous
64#endif
65 .endm
66
67/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user) 68ENTRY(copy_to_user)
38 CFI_STARTPROC 69 CFI_STARTPROC
39 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx 71 movq %rdi,%rcx
41 addq %rdx,%rcx 72 addq %rdx,%rcx
42 jc bad_to_user 73 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx 74 cmpq TI_addr_limit(%rax),%rcx
44 jae bad_to_user 75 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC 77 CFI_ENDPROC
48 78
49ENTRY(copy_user_generic) 79/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user)
50 CFI_STARTPROC 81 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */ 82 GET_THREAD_INFO(%rax)
83 movq %rsi,%rcx
84 addq %rdx,%rcx
85 jc bad_from_user
86 cmpq TI_addr_limit(%rax),%rcx
87 jae bad_from_user
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 88 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC 89 CFI_ENDPROC
90ENDPROC(copy_from_user)
54 91
55ENTRY(__copy_from_user_inatomic) 92ENTRY(copy_user_generic)
56 CFI_STARTPROC 93 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 94 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC 95 CFI_ENDPROC
96ENDPROC(copy_user_generic)
60 97
61/* Standard copy_from_user with segment limit checking */ 98ENTRY(__copy_from_user_inatomic)
62ENTRY(copy_from_user)
63 CFI_STARTPROC 99 CFI_STARTPROC
64 GET_THREAD_INFO(%rax)
65 movq %rsi,%rcx
66 addq %rdx,%rcx
67 jc bad_from_user
68 cmpq threadinfo_addr_limit(%rax),%rcx
69 jae bad_from_user
70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 100 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC 101 CFI_ENDPROC
73ENDPROC(copy_from_user) 102ENDPROC(__copy_from_user_inatomic)
74 103
75 .section .fixup,"ax" 104 .section .fixup,"ax"
76 /* must zero dest */ 105 /* must zero dest */
106ENTRY(bad_from_user)
77bad_from_user: 107bad_from_user:
78 CFI_STARTPROC 108 CFI_STARTPROC
79 movl %edx,%ecx 109 movl %edx,%ecx
@@ -81,271 +111,158 @@ bad_from_user:
81 rep 111 rep
82 stosb 112 stosb
83bad_to_user: 113bad_to_user:
84 movl %edx,%eax 114 movl %edx,%eax
85 ret 115 ret
86 CFI_ENDPROC 116 CFI_ENDPROC
87END(bad_from_user) 117ENDPROC(bad_from_user)
88 .previous 118 .previous
89 119
90
91/* 120/*
92 * copy_user_generic_unrolled - memory copy with exception handling. 121 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq 122 * This version is for CPUs like P4 that don't have efficient micro
94 * 123 * code for rep movsq
95 * Input: 124 *
125 * Input:
96 * rdi destination 126 * rdi destination
97 * rsi source 127 * rsi source
98 * rdx count 128 * rdx count
99 * ecx zero flag -- if true zero destination on error
100 * 129 *
101 * Output: 130 * Output:
102 * eax uncopied bytes or 0 if successful. 131 * eax uncopied bytes or 0 if successfull.
103 */ 132 */
104ENTRY(copy_user_generic_unrolled) 133ENTRY(copy_user_generic_unrolled)
105 CFI_STARTPROC 134 CFI_STARTPROC
106 pushq %rbx 135 cmpl $8,%edx
107 CFI_ADJUST_CFA_OFFSET 8 136 jb 20f /* less then 8 bytes, go to byte copy loop */
108 CFI_REL_OFFSET rbx, 0 137 ALIGN_DESTINATION
109 pushq %rcx 138 movl %edx,%ecx
110 CFI_ADJUST_CFA_OFFSET 8 139 andl $63,%edx
111 CFI_REL_OFFSET rcx, 0 140 shrl $6,%ecx
112 xorl %eax,%eax /*zero for the exception handler */ 141 jz 17f
113 1421: movq (%rsi),%r8
114#ifdef FIX_ALIGNMENT 1432: movq 1*8(%rsi),%r9
115 /* check for bad alignment of destination */ 1443: movq 2*8(%rsi),%r10
116 movl %edi,%ecx 1454: movq 3*8(%rsi),%r11
117 andl $7,%ecx 1465: movq %r8,(%rdi)
118 jnz .Lbad_alignment 1476: movq %r9,1*8(%rdi)
119.Lafter_bad_alignment: 1487: movq %r10,2*8(%rdi)
120#endif 1498: movq %r11,3*8(%rdi)
121 1509: movq 4*8(%rsi),%r8
122 movq %rdx,%rcx 15110: movq 5*8(%rsi),%r9
123 15211: movq 6*8(%rsi),%r10
124 movl $64,%ebx 15312: movq 7*8(%rsi),%r11
125 shrq $6,%rdx 15413: movq %r8,4*8(%rdi)
126 decq %rdx 15514: movq %r9,5*8(%rdi)
127 js .Lhandle_tail 15615: movq %r10,6*8(%rdi)
128 15716: movq %r11,7*8(%rdi)
129 .p2align 4
130.Lloop:
131.Ls1: movq (%rsi),%r11
132.Ls2: movq 1*8(%rsi),%r8
133.Ls3: movq 2*8(%rsi),%r9
134.Ls4: movq 3*8(%rsi),%r10
135.Ld1: movq %r11,(%rdi)
136.Ld2: movq %r8,1*8(%rdi)
137.Ld3: movq %r9,2*8(%rdi)
138.Ld4: movq %r10,3*8(%rdi)
139
140.Ls5: movq 4*8(%rsi),%r11
141.Ls6: movq 5*8(%rsi),%r8
142.Ls7: movq 6*8(%rsi),%r9
143.Ls8: movq 7*8(%rsi),%r10
144.Ld5: movq %r11,4*8(%rdi)
145.Ld6: movq %r8,5*8(%rdi)
146.Ld7: movq %r9,6*8(%rdi)
147.Ld8: movq %r10,7*8(%rdi)
148
149 decq %rdx
150
151 leaq 64(%rsi),%rsi 158 leaq 64(%rsi),%rsi
152 leaq 64(%rdi),%rdi 159 leaq 64(%rdi),%rdi
153
154 jns .Lloop
155
156 .p2align 4
157.Lhandle_tail:
158 movl %ecx,%edx
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lhandle_7
162 movl $8,%ebx
163 .p2align 4
164.Lloop_8:
165.Ls9: movq (%rsi),%r8
166.Ld9: movq %r8,(%rdi)
167 decl %ecx 160 decl %ecx
168 leaq 8(%rdi),%rdi 161 jnz 1b
16217: movl %edx,%ecx
163 andl $7,%edx
164 shrl $3,%ecx
165 jz 20f
16618: movq (%rsi),%r8
16719: movq %r8,(%rdi)
169 leaq 8(%rsi),%rsi 168 leaq 8(%rsi),%rsi
170 jnz .Lloop_8 169 leaq 8(%rdi),%rdi
171 170 decl %ecx
172.Lhandle_7: 171 jnz 18b
17220: andl %edx,%edx
173 jz 23f
173 movl %edx,%ecx 174 movl %edx,%ecx
174 andl $7,%ecx 17521: movb (%rsi),%al
175 jz .Lende 17622: movb %al,(%rdi)
176 .p2align 4
177.Lloop_1:
178.Ls10: movb (%rsi),%bl
179.Ld10: movb %bl,(%rdi)
180 incq %rdi
181 incq %rsi 177 incq %rsi
178 incq %rdi
182 decl %ecx 179 decl %ecx
183 jnz .Lloop_1 180 jnz 21b
184 18123: xor %eax,%eax
185 CFI_REMEMBER_STATE
186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
193 ret 182 ret
194 CFI_RESTORE_STATE
195 183
196#ifdef FIX_ALIGNMENT 184 .section .fixup,"ax"
197 /* align destination */ 18530: shll $6,%ecx
198 .p2align 4 186 addl %ecx,%edx
199.Lbad_alignment: 187 jmp 60f
200 movl $8,%r9d 18840: lea (%rdx,%rcx,8),%rdx
201 subl %ecx,%r9d 189 jmp 60f
202 movl %r9d,%ecx 19050: movl %ecx,%edx
203 cmpq %r9,%rdx 19160: jmp copy_user_handle_tail /* ecx is zerorest also */
204 jz .Lhandle_7 192 .previous
205 js .Lhandle_7
206.Lalign_1:
207.Ls11: movb (%rsi),%bl
208.Ld11: movb %bl,(%rdi)
209 incq %rsi
210 incq %rdi
211 decl %ecx
212 jnz .Lalign_1
213 subq %r9,%rdx
214 jmp .Lafter_bad_alignment
215#endif
216 193
217 /* table sorted by exception address */
218 .section __ex_table,"a" 194 .section __ex_table,"a"
219 .align 8 195 .align 8
220 .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ 196 .quad 1b,30b
221 .quad .Ls2,.Ls1e 197 .quad 2b,30b
222 .quad .Ls3,.Ls1e 198 .quad 3b,30b
223 .quad .Ls4,.Ls1e 199 .quad 4b,30b
224 .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ 200 .quad 5b,30b
225 .quad .Ld2,.Ls2e 201 .quad 6b,30b
226 .quad .Ld3,.Ls3e 202 .quad 7b,30b
227 .quad .Ld4,.Ls4e 203 .quad 8b,30b
228 .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ 204 .quad 9b,30b
229 .quad .Ls6,.Ls5e 205 .quad 10b,30b
230 .quad .Ls7,.Ls5e 206 .quad 11b,30b
231 .quad .Ls8,.Ls5e 207 .quad 12b,30b
232 .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ 208 .quad 13b,30b
233 .quad .Ld6,.Ls6e 209 .quad 14b,30b
234 .quad .Ld7,.Ls7e 210 .quad 15b,30b
235 .quad .Ld8,.Ls8e 211 .quad 16b,30b
236 .quad .Ls9,.Le_quad 212 .quad 18b,40b
237 .quad .Ld9,.Le_quad 213 .quad 19b,40b
238 .quad .Ls10,.Le_byte 214 .quad 21b,50b
239 .quad .Ld10,.Le_byte 215 .quad 22b,50b
240#ifdef FIX_ALIGNMENT
241 .quad .Ls11,.Lzero_rest
242 .quad .Ld11,.Lzero_rest
243#endif
244 .quad .Le5,.Le_zero
245 .previous 216 .previous
246
247 /* eax: zero, ebx: 64 */
248.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
249.Ls2e: addl $8,%eax
250.Ls3e: addl $8,%eax
251.Ls4e: addl $8,%eax
252.Ls5e: addl $8,%eax
253.Ls6e: addl $8,%eax
254.Ls7e: addl $8,%eax
255.Ls8e: addl $8,%eax
256 addq %rbx,%rdi /* +64 */
257 subq %rax,%rdi /* correct destination with computed offset */
258
259 shlq $6,%rdx /* loop counter * 64 (stride length) */
260 addq %rax,%rdx /* add offset to loopcnt */
261 andl $63,%ecx /* remaining bytes */
262 addq %rcx,%rdx /* add them */
263 jmp .Lzero_rest
264
265 /* exception on quad word loop in tail handling */
266 /* ecx: loopcnt/8, %edx: length, rdi: correct */
267.Le_quad:
268 shll $3,%ecx
269 andl $7,%edx
270 addl %ecx,%edx
271 /* edx: bytes to zero, rdi: dest, eax:zero */
272.Lzero_rest:
273 cmpl $0,(%rsp)
274 jz .Le_zero
275 movq %rdx,%rcx
276.Le_byte:
277 xorl %eax,%eax
278.Le5: rep
279 stosb
280 /* when there is another exception while zeroing the rest just return */
281.Le_zero:
282 movq %rdx,%rax
283 jmp .Lende
284 CFI_ENDPROC 217 CFI_ENDPROC
285ENDPROC(copy_user_generic) 218ENDPROC(copy_user_generic_unrolled)
286 219
287 220/* Some CPUs run faster using the string copy instructions.
288 /* Some CPUs run faster using the string copy instructions. 221 * This is also a lot simpler. Use them when possible.
289 This is also a lot simpler. Use them when possible. 222 *
290 Patch in jmps to this code instead of copying it fully 223 * Only 4GB of copy is supported. This shouldn't be a problem
291 to avoid unwanted aliasing in the exception tables. */ 224 * because the kernel normally only writes from/to page sized chunks
292 225 * even if user space passed a longer buffer.
293 /* rdi destination 226 * And more would be dangerous because both Intel and AMD have
294 * rsi source 227 * errata with rep movsq > 4GB. If someone feels the need to fix
295 * rdx count 228 * this please consider this.
296 * ecx zero flag 229 *
297 * 230 * Input:
298 * Output: 231 * rdi destination
299 * eax uncopied bytes or 0 if successfull. 232 * rsi source
300 * 233 * rdx count
301 * Only 4GB of copy is supported. This shouldn't be a problem 234 *
302 * because the kernel normally only writes from/to page sized chunks 235 * Output:
303 * even if user space passed a longer buffer. 236 * eax uncopied bytes or 0 if successful.
304 * And more would be dangerous because both Intel and AMD have 237 */
305 * errata with rep movsq > 4GB. If someone feels the need to fix
306 * this please consider this.
307 */
308ENTRY(copy_user_generic_string) 238ENTRY(copy_user_generic_string)
309 CFI_STARTPROC 239 CFI_STARTPROC
310 movl %ecx,%r8d /* save zero flag */ 240 andl %edx,%edx
241 jz 4f
242 cmpl $8,%edx
243 jb 2f /* less than 8 bytes, go to byte copy loop */
244 ALIGN_DESTINATION
311 movl %edx,%ecx 245 movl %edx,%ecx
312 shrl $3,%ecx 246 shrl $3,%ecx
313 andl $7,%edx 247 andl $7,%edx
314 jz 10f 2481: rep
3151: rep
316 movsq
317 movl %edx,%ecx
3182: rep
319 movsb
3209: movl %ecx,%eax
321 ret
322
323 /* multiple of 8 byte */
32410: rep
325 movsq 249 movsq
326 xor %eax,%eax 2502: movl %edx,%ecx
2513: rep
252 movsb
2534: xorl %eax,%eax
327 ret 254 ret
328 255
329 /* exception handling */ 256 .section .fixup,"ax"
3303: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ 25711: lea (%rdx,%rcx,8),%rcx
331 jmp 6f 25812: movl %ecx,%edx /* ecx is zerorest also */
3325: movl %ecx,%eax /* exception on byte loop */ 259 jmp copy_user_handle_tail
333 /* eax: left over bytes */ 260 .previous
3346: testl %r8d,%r8d /* zero flag set? */
335 jz 7f
336 movl %eax,%ecx /* initialize x86 loop counter */
337 push %rax
338 xorl %eax,%eax
3398: rep
340 stosb /* zero the rest */
34111: pop %rax
3427: ret
343 CFI_ENDPROC
344END(copy_user_generic_c)
345 261
346 .section __ex_table,"a" 262 .section __ex_table,"a"
347 .quad 1b,3b 263 .align 8
348 .quad 2b,5b 264 .quad 1b,11b
349 .quad 8b,11b 265 .quad 3b,12b
350 .quad 10b,3b
351 .previous 266 .previous
267 CFI_ENDPROC
268ENDPROC(copy_user_generic_string)
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 9d3d1ab83763..40e0e309d27e 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -1,4 +1,6 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs. 1/*
2 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
3 * Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2. 4 * Subject to the GNU Public License v2.
3 * 5 *
4 * Functions to copy from and to user space. 6 * Functions to copy from and to user space.
@@ -12,204 +14,125 @@
12#include <asm/current.h> 14#include <asm/current.h>
13#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
14#include <asm/thread_info.h> 16#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * This will force destination/source out of cache for more performance.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag when 1 zero on exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40 17
18 .macro ALIGN_DESTINATION
41#ifdef FIX_ALIGNMENT 19#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */ 20 /* check for bad alignment of destination */
43 movl %edi,%ecx 21 movl %edi,%ecx
44 andl $7,%ecx 22 andl $7,%ecx
45 jnz .Lbad_alignment 23 jz 102f /* already aligned */
46.Lafter_bad_alignment: 24 subl $8,%ecx
47#endif 25 negl %ecx
48 26 subl %ecx,%edx
49 movq %rdx,%rcx 27100: movb (%rsi),%al
50 28101: movb %al,(%rdi)
51 movl $64,%ebx 29 incq %rsi
52 shrq $6,%rdx 30 incq %rdi
53 decq %rdx 31 decl %ecx
54 js .Lhandle_tail 32 jnz 100b
55 33102:
56 .p2align 4 34 .section .fixup,"ax"
57.Lloop: 35103: addl %r8d,%edx /* ecx is zerorest also */
58.Ls1: movq (%rsi),%r11 36 jmp copy_user_handle_tail
59.Ls2: movq 1*8(%rsi),%r8 37 .previous
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75 38
76 dec %rdx 39 .section __ex_table,"a"
40 .align 8
41 .quad 100b,103b
42 .quad 101b,103b
43 .previous
44#endif
45 .endm
77 46
47/*
48 * copy_user_nocache - Uncached memory copy with exception handling
49 * This will force destination/source out of cache for more performance.
50 */
51ENTRY(__copy_user_nocache)
52 CFI_STARTPROC
53 cmpl $8,%edx
54 jb 20f /* less then 8 bytes, go to byte copy loop */
55 ALIGN_DESTINATION
56 movl %edx,%ecx
57 andl $63,%edx
58 shrl $6,%ecx
59 jz 17f
601: movq (%rsi),%r8
612: movq 1*8(%rsi),%r9
623: movq 2*8(%rsi),%r10
634: movq 3*8(%rsi),%r11
645: movnti %r8,(%rdi)
656: movnti %r9,1*8(%rdi)
667: movnti %r10,2*8(%rdi)
678: movnti %r11,3*8(%rdi)
689: movq 4*8(%rsi),%r8
6910: movq 5*8(%rsi),%r9
7011: movq 6*8(%rsi),%r10
7112: movq 7*8(%rsi),%r11
7213: movnti %r8,4*8(%rdi)
7314: movnti %r9,5*8(%rdi)
7415: movnti %r10,6*8(%rdi)
7516: movnti %r11,7*8(%rdi)
78 leaq 64(%rsi),%rsi 76 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi 77 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx 78 decl %ecx
95 leaq 8(%rdi),%rdi 79 jnz 1b
8017: movl %edx,%ecx
81 andl $7,%edx
82 shrl $3,%ecx
83 jz 20f
8418: movq (%rsi),%r8
8519: movnti %r8,(%rdi)
96 leaq 8(%rsi),%rsi 86 leaq 8(%rsi),%rsi
97 jnz .Lloop_8 87 leaq 8(%rdi),%rdi
98 88 decl %ecx
99.Lhandle_7: 89 jnz 18b
9020: andl %edx,%edx
91 jz 23f
100 movl %edx,%ecx 92 movl %edx,%ecx
101 andl $7,%ecx 9321: movb (%rsi),%al
102 jz .Lende 9422: movb %al,(%rdi)
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi 95 incq %rsi
96 incq %rdi
109 decl %ecx 97 decl %ecx
110 jnz .Lloop_1 98 jnz 21b
111 9923: xorl %eax,%eax
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 sfence 100 sfence
121 ret 101 ret
122 CFI_RESTORE_STATE
123 102
124#ifdef FIX_ALIGNMENT 103 .section .fixup,"ax"
125 /* align destination */ 10430: shll $6,%ecx
126 .p2align 4 105 addl %ecx,%edx
127.Lbad_alignment: 106 jmp 60f
128 movl $8,%r9d 10740: lea (%rdx,%rcx,8),%rdx
129 subl %ecx,%r9d 108 jmp 60f
130 movl %r9d,%ecx 10950: movl %ecx,%edx
131 cmpq %r9,%rdx 11060: sfence
132 jz .Lhandle_7 111 movl %r8d,%ecx
133 js .Lhandle_7 112 jmp copy_user_handle_tail
134.Lalign_1: 113 .previous
135.Ls11: movb (%rsi),%bl
136.Ld11: movb %bl,(%rdi)
137 incq %rsi
138 incq %rdi
139 decl %ecx
140 jnz .Lalign_1
141 subq %r9,%rdx
142 jmp .Lafter_bad_alignment
143#endif
144 114
145 /* table sorted by exception address */
146 .section __ex_table,"a" 115 .section __ex_table,"a"
147 .align 8 116 .quad 1b,30b
148 .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ 117 .quad 2b,30b
149 .quad .Ls2,.Ls1e 118 .quad 3b,30b
150 .quad .Ls3,.Ls1e 119 .quad 4b,30b
151 .quad .Ls4,.Ls1e 120 .quad 5b,30b
152 .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ 121 .quad 6b,30b
153 .quad .Ld2,.Ls2e 122 .quad 7b,30b
154 .quad .Ld3,.Ls3e 123 .quad 8b,30b
155 .quad .Ld4,.Ls4e 124 .quad 9b,30b
156 .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ 125 .quad 10b,30b
157 .quad .Ls6,.Ls5e 126 .quad 11b,30b
158 .quad .Ls7,.Ls5e 127 .quad 12b,30b
159 .quad .Ls8,.Ls5e 128 .quad 13b,30b
160 .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ 129 .quad 14b,30b
161 .quad .Ld6,.Ls6e 130 .quad 15b,30b
162 .quad .Ld7,.Ls7e 131 .quad 16b,30b
163 .quad .Ld8,.Ls8e 132 .quad 18b,40b
164 .quad .Ls9,.Le_quad 133 .quad 19b,40b
165 .quad .Ld9,.Le_quad 134 .quad 21b,50b
166 .quad .Ls10,.Le_byte 135 .quad 22b,50b
167 .quad .Ld10,.Le_byte
168#ifdef FIX_ALIGNMENT
169 .quad .Ls11,.Lzero_rest
170 .quad .Ld11,.Lzero_rest
171#endif
172 .quad .Le5,.Le_zero
173 .previous 136 .previous
174
175 /* eax: zero, ebx: 64 */
176.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
177.Ls2e: addl $8,%eax
178.Ls3e: addl $8,%eax
179.Ls4e: addl $8,%eax
180.Ls5e: addl $8,%eax
181.Ls6e: addl $8,%eax
182.Ls7e: addl $8,%eax
183.Ls8e: addl $8,%eax
184 addq %rbx,%rdi /* +64 */
185 subq %rax,%rdi /* correct destination with computed offset */
186
187 shlq $6,%rdx /* loop counter * 64 (stride length) */
188 addq %rax,%rdx /* add offset to loopcnt */
189 andl $63,%ecx /* remaining bytes */
190 addq %rcx,%rdx /* add them */
191 jmp .Lzero_rest
192
193 /* exception on quad word loop in tail handling */
194 /* ecx: loopcnt/8, %edx: length, rdi: correct */
195.Le_quad:
196 shll $3,%ecx
197 andl $7,%edx
198 addl %ecx,%edx
199 /* edx: bytes to zero, rdi: dest, eax:zero */
200.Lzero_rest:
201 cmpl $0,(%rsp) /* zero flag set? */
202 jz .Le_zero
203 movq %rdx,%rcx
204.Le_byte:
205 xorl %eax,%eax
206.Le5: rep
207 stosb
208 /* when there is another exception while zeroing the rest just return */
209.Le_zero:
210 movq %rdx,%rax
211 jmp .Lende
212 CFI_ENDPROC 137 CFI_ENDPROC
213ENDPROC(__copy_user_nocache) 138ENDPROC(__copy_user_nocache)
214
215
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c
index d710f2d167bb..f4568605d7d5 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
6 * 7 *
7 * The __delay function must _NOT_ be inlined as its execution time 8 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional 9 * depends wildly on alignment on many x86 processors. The additional
@@ -28,16 +29,22 @@
28/* simple loop based delay: */ 29/* simple loop based delay: */
29static void delay_loop(unsigned long loops) 30static void delay_loop(unsigned long loops)
30{ 31{
31 int d0; 32 asm volatile(
32 33 " test %0,%0 \n"
33 __asm__ __volatile__( 34 " jz 3f \n"
34 "\tjmp 1f\n" 35 " jmp 1f \n"
35 ".align 16\n" 36
36 "1:\tjmp 2f\n" 37 ".align 16 \n"
37 ".align 16\n" 38 "1: jmp 2f \n"
38 "2:\tdecl %0\n\tjns 2b" 39
39 :"=&a" (d0) 40 ".align 16 \n"
40 :"0" (loops)); 41 "2: dec %0 \n"
42 " jnz 2b \n"
43 "3: dec %0 \n"
44
45 : /* we don't need output */
46 :"a" (loops)
47 );
41} 48}
42 49
43/* TSC based delay: */ 50/* TSC based delay: */
@@ -91,7 +98,7 @@ void use_tsc_delay(void)
91int __devinit read_current_timer(unsigned long *timer_val) 98int __devinit read_current_timer(unsigned long *timer_val)
92{ 99{
93 if (delay_fn == delay_tsc) { 100 if (delay_fn == delay_tsc) {
94 rdtscl(*timer_val); 101 rdtscll(*timer_val);
95 return 0; 102 return 0;
96 } 103 }
97 return -1; 104 return -1;
@@ -101,31 +108,30 @@ void __delay(unsigned long loops)
101{ 108{
102 delay_fn(loops); 109 delay_fn(loops);
103} 110}
111EXPORT_SYMBOL(__delay);
104 112
105inline void __const_udelay(unsigned long xloops) 113inline void __const_udelay(unsigned long xloops)
106{ 114{
107 int d0; 115 int d0;
108 116
109 xloops *= 4; 117 xloops *= 4;
110 __asm__("mull %0" 118 asm("mull %%edx"
111 :"=d" (xloops), "=&a" (d0) 119 :"=d" (xloops), "=&a" (d0)
112 :"1" (xloops), "0" 120 :"1" (xloops), "0"
113 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 121 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
114 122
115 __delay(++xloops); 123 __delay(++xloops);
116} 124}
125EXPORT_SYMBOL(__const_udelay);
117 126
118void __udelay(unsigned long usecs) 127void __udelay(unsigned long usecs)
119{ 128{
120 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ 129 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
121} 130}
131EXPORT_SYMBOL(__udelay);
122 132
123void __ndelay(unsigned long nsecs) 133void __ndelay(unsigned long nsecs)
124{ 134{
125 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 135 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
126} 136}
127
128EXPORT_SYMBOL(__delay);
129EXPORT_SYMBOL(__const_udelay);
130EXPORT_SYMBOL(__udelay);
131EXPORT_SYMBOL(__ndelay); 137EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
deleted file mode 100644
index 4c441be92641..000000000000
--- a/arch/x86/lib/delay_64.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/timex.h>
14#include <linux/preempt.h>
15#include <linux/delay.h>
16#include <linux/init.h>
17
18#include <asm/delay.h>
19#include <asm/msr.h>
20
21#ifdef CONFIG_SMP
22#include <asm/smp.h>
23#endif
24
25int __devinit read_current_timer(unsigned long *timer_value)
26{
27 rdtscll(*timer_value);
28 return 0;
29}
30
31void __delay(unsigned long loops)
32{
33 unsigned bclock, now;
34 int cpu;
35
36 preempt_disable();
37 cpu = smp_processor_id();
38 rdtscl(bclock);
39 for (;;) {
40 rdtscl(now);
41 if ((now - bclock) >= loops)
42 break;
43
44 /* Allow RT tasks to run */
45 preempt_enable();
46 rep_nop();
47 preempt_disable();
48
49 /*
50 * It is possible that we moved to another CPU, and
51 * since TSC's are per-cpu we need to calculate
52 * that. The delay must guarantee that we wait "at
53 * least" the amount of time. Being moved to another
54 * CPU could make the wait longer but we just need to
55 * make sure we waited long enough. Rebalance the
56 * counter for this CPU.
57 */
58 if (unlikely(cpu != smp_processor_id())) {
59 loops -= (now - bclock);
60 cpu = smp_processor_id();
61 rdtscl(bclock);
62 }
63 }
64 preempt_enable();
65}
66EXPORT_SYMBOL(__delay);
67
68inline void __const_udelay(unsigned long xloops)
69{
70 __delay(((xloops * HZ *
71 cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1);
72}
73EXPORT_SYMBOL(__const_udelay);
74
75void __udelay(unsigned long usecs)
76{
77 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
78}
79EXPORT_SYMBOL(__udelay);
80
81void __ndelay(unsigned long nsecs)
82{
83 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
84}
85EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser.S
index 5448876261f8..ad374003742f 100644
--- a/arch/x86/lib/getuser_64.S
+++ b/arch/x86/lib/getuser.S
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Copyright 1998 Linus Torvalds 4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen 5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
6 * 7 *
7 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -13,14 +14,13 @@
13/* 14/*
14 * __get_user_X 15 * __get_user_X
15 * 16 *
16 * Inputs: %rcx contains the address. 17 * Inputs: %[r|e]ax contains the address.
17 * The register is modified, but all changes are undone 18 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it. 19 * before returning because the C code doesn't know about it.
19 * 20 *
20 * Outputs: %rax is error code (0 or -EFAULT) 21 * Outputs: %[r|e]ax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value 22 * %[r|e]dx contains zero-extended value
22 * 23 *
23 * %r8 is destroyed.
24 * 24 *
25 * These functions should not modify any other registers, 25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly. 26 * as they get called from within inline assembly.
@@ -32,78 +32,73 @@
32#include <asm/errno.h> 32#include <asm/errno.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h> 34#include <asm/thread_info.h>
35#include <asm/asm.h>
35 36
36 .text 37 .text
37ENTRY(__get_user_1) 38ENTRY(__get_user_1)
38 CFI_STARTPROC 39 CFI_STARTPROC
39 GET_THREAD_INFO(%r8) 40 GET_THREAD_INFO(%_ASM_DX)
40 cmpq threadinfo_addr_limit(%r8),%rcx 41 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
41 jae bad_get_user 42 jae bad_get_user
421: movzb (%rcx),%edx 431: movzb (%_ASM_AX),%edx
43 xorl %eax,%eax 44 xor %eax,%eax
44 ret 45 ret
45 CFI_ENDPROC 46 CFI_ENDPROC
46ENDPROC(__get_user_1) 47ENDPROC(__get_user_1)
47 48
48ENTRY(__get_user_2) 49ENTRY(__get_user_2)
49 CFI_STARTPROC 50 CFI_STARTPROC
50 GET_THREAD_INFO(%r8) 51 add $1,%_ASM_AX
51 addq $1,%rcx 52 jc bad_get_user
52 jc 20f 53 GET_THREAD_INFO(%_ASM_DX)
53 cmpq threadinfo_addr_limit(%r8),%rcx 54 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
54 jae 20f 55 jae bad_get_user
55 decq %rcx 562: movzwl -1(%_ASM_AX),%edx
562: movzwl (%rcx),%edx 57 xor %eax,%eax
57 xorl %eax,%eax
58 ret 58 ret
5920: decq %rcx
60 jmp bad_get_user
61 CFI_ENDPROC 59 CFI_ENDPROC
62ENDPROC(__get_user_2) 60ENDPROC(__get_user_2)
63 61
64ENTRY(__get_user_4) 62ENTRY(__get_user_4)
65 CFI_STARTPROC 63 CFI_STARTPROC
66 GET_THREAD_INFO(%r8) 64 add $3,%_ASM_AX
67 addq $3,%rcx 65 jc bad_get_user
68 jc 30f 66 GET_THREAD_INFO(%_ASM_DX)
69 cmpq threadinfo_addr_limit(%r8),%rcx 67 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
70 jae 30f 68 jae bad_get_user
71 subq $3,%rcx 693: mov -3(%_ASM_AX),%edx
723: movl (%rcx),%edx 70 xor %eax,%eax
73 xorl %eax,%eax
74 ret 71 ret
7530: subq $3,%rcx
76 jmp bad_get_user
77 CFI_ENDPROC 72 CFI_ENDPROC
78ENDPROC(__get_user_4) 73ENDPROC(__get_user_4)
79 74
75#ifdef CONFIG_X86_64
80ENTRY(__get_user_8) 76ENTRY(__get_user_8)
81 CFI_STARTPROC 77 CFI_STARTPROC
82 GET_THREAD_INFO(%r8) 78 add $7,%_ASM_AX
83 addq $7,%rcx 79 jc bad_get_user
84 jc 40f 80 GET_THREAD_INFO(%_ASM_DX)
85 cmpq threadinfo_addr_limit(%r8),%rcx 81 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
86 jae 40f 82 jae bad_get_user
87 subq $7,%rcx 834: movq -7(%_ASM_AX),%_ASM_DX
884: movq (%rcx),%rdx 84 xor %eax,%eax
89 xorl %eax,%eax
90 ret 85 ret
9140: subq $7,%rcx
92 jmp bad_get_user
93 CFI_ENDPROC 86 CFI_ENDPROC
94ENDPROC(__get_user_8) 87ENDPROC(__get_user_8)
88#endif
95 89
96bad_get_user: 90bad_get_user:
97 CFI_STARTPROC 91 CFI_STARTPROC
98 xorl %edx,%edx 92 xor %edx,%edx
99 movq $(-EFAULT),%rax 93 mov $(-EFAULT),%_ASM_AX
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102END(bad_get_user) 96END(bad_get_user)
103 97
104.section __ex_table,"a" 98.section __ex_table,"a"
105 .quad 1b,bad_get_user 99 _ASM_PTR 1b,bad_get_user
106 .quad 2b,bad_get_user 100 _ASM_PTR 2b,bad_get_user
107 .quad 3b,bad_get_user 101 _ASM_PTR 3b,bad_get_user
108 .quad 4b,bad_get_user 102#ifdef CONFIG_X86_64
109.previous 103 _ASM_PTR 4b,bad_get_user
104#endif
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
deleted file mode 100644
index 6d84b53f12a2..000000000000
--- a/arch/x86/lib/getuser_32.S
+++ /dev/null
@@ -1,78 +0,0 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __get_user_X
18 *
19 * Inputs: %eax contains the address
20 *
21 * Outputs: %eax is error code (0 or -EFAULT)
22 * %edx contains zero-extended value
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28.text
29ENTRY(__get_user_1)
30 CFI_STARTPROC
31 GET_THREAD_INFO(%edx)
32 cmpl TI_addr_limit(%edx),%eax
33 jae bad_get_user
341: movzbl (%eax),%edx
35 xorl %eax,%eax
36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
39
40ENTRY(__get_user_2)
41 CFI_STARTPROC
42 addl $1,%eax
43 jc bad_get_user
44 GET_THREAD_INFO(%edx)
45 cmpl TI_addr_limit(%edx),%eax
46 jae bad_get_user
472: movzwl -1(%eax),%edx
48 xorl %eax,%eax
49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
52
53ENTRY(__get_user_4)
54 CFI_STARTPROC
55 addl $3,%eax
56 jc bad_get_user
57 GET_THREAD_INFO(%edx)
58 cmpl TI_addr_limit(%edx),%eax
59 jae bad_get_user
603: movl -3(%eax),%edx
61 xorl %eax,%eax
62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
65
66bad_get_user:
67 CFI_STARTPROC
68 xorl %edx,%edx
69 movl $-14,%eax
70 ret
71 CFI_ENDPROC
72END(bad_get_user)
73
74.section __ex_table,"a"
75 .long 1b,bad_get_user
76 .long 2b,bad_get_user
77 .long 3b,bad_get_user
78.previous
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 57d043fa893e..d5a2b39f882b 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -30,10 +30,10 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 if (safe) {
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); 33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
34 err = rv.err; 34 err = rv.err;
35 } else { 35 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); 36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
37 } 37 }
38 *l = rv.l; 38 *l = rv.l;
39 *h = rv.h; 39 *h = rv.h;
@@ -64,10 +64,10 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
64 rv.l = l; 64 rv.l = l;
65 rv.h = h; 65 rv.h = h;
66 if (safe) { 66 if (safe) {
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); 67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
68 err = rv.err; 68 err = rv.err;
69 } else { 69 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); 70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
71 } 71 }
72 72
73 return err; 73 return err;
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S
index f58fba109d18..36b0d15ae6e9 100644
--- a/arch/x86/lib/putuser_32.S
+++ b/arch/x86/lib/putuser.S
@@ -2,6 +2,8 @@
2 * __put_user functions. 2 * __put_user functions.
3 * 3 *
4 * (C) Copyright 2005 Linus Torvalds 4 * (C) Copyright 2005 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 * (C) Copyright 2008 Glauber Costa
5 * 7 *
6 * These functions have a non-standard call interface 8 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they 9 * to make them more efficient, especially as they
@@ -11,6 +13,8 @@
11#include <linux/linkage.h> 13#include <linux/linkage.h>
12#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
13#include <asm/thread_info.h> 15#include <asm/thread_info.h>
16#include <asm/errno.h>
17#include <asm/asm.h>
14 18
15 19
16/* 20/*
@@ -26,73 +30,68 @@
26 */ 30 */
27 31
28#define ENTER CFI_STARTPROC ; \ 32#define ENTER CFI_STARTPROC ; \
29 pushl %ebx ; \ 33 GET_THREAD_INFO(%_ASM_BX)
30 CFI_ADJUST_CFA_OFFSET 4 ; \ 34#define EXIT ret ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC 35 CFI_ENDPROC
38 36
39.text 37.text
40ENTRY(__put_user_1) 38ENTRY(__put_user_1)
41 ENTER 39 ENTER
42 cmpl TI_addr_limit(%ebx),%ecx 40 cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
43 jae bad_put_user 41 jae bad_put_user
441: movb %al,(%ecx) 421: movb %al,(%_ASM_CX)
45 xorl %eax,%eax 43 xor %eax,%eax
46 EXIT 44 EXIT
47ENDPROC(__put_user_1) 45ENDPROC(__put_user_1)
48 46
49ENTRY(__put_user_2) 47ENTRY(__put_user_2)
50 ENTER 48 ENTER
51 movl TI_addr_limit(%ebx),%ebx 49 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
52 subl $1,%ebx 50 sub $1,%_ASM_BX
53 cmpl %ebx,%ecx 51 cmp %_ASM_BX,%_ASM_CX
54 jae bad_put_user 52 jae bad_put_user
552: movw %ax,(%ecx) 532: movw %ax,(%_ASM_CX)
56 xorl %eax,%eax 54 xor %eax,%eax
57 EXIT 55 EXIT
58ENDPROC(__put_user_2) 56ENDPROC(__put_user_2)
59 57
60ENTRY(__put_user_4) 58ENTRY(__put_user_4)
61 ENTER 59 ENTER
62 movl TI_addr_limit(%ebx),%ebx 60 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
63 subl $3,%ebx 61 sub $3,%_ASM_BX
64 cmpl %ebx,%ecx 62 cmp %_ASM_BX,%_ASM_CX
65 jae bad_put_user 63 jae bad_put_user
663: movl %eax,(%ecx) 643: movl %eax,(%_ASM_CX)
67 xorl %eax,%eax 65 xor %eax,%eax
68 EXIT 66 EXIT
69ENDPROC(__put_user_4) 67ENDPROC(__put_user_4)
70 68
71ENTRY(__put_user_8) 69ENTRY(__put_user_8)
72 ENTER 70 ENTER
73 movl TI_addr_limit(%ebx),%ebx 71 mov TI_addr_limit(%_ASM_BX),%_ASM_BX
74 subl $7,%ebx 72 sub $7,%_ASM_BX
75 cmpl %ebx,%ecx 73 cmp %_ASM_BX,%_ASM_CX
76 jae bad_put_user 74 jae bad_put_user
774: movl %eax,(%ecx) 754: mov %_ASM_AX,(%_ASM_CX)
785: movl %edx,4(%ecx) 76#ifdef CONFIG_X86_32
79 xorl %eax,%eax 775: movl %edx,4(%_ASM_CX)
78#endif
79 xor %eax,%eax
80 EXIT 80 EXIT
81ENDPROC(__put_user_8) 81ENDPROC(__put_user_8)
82 82
83bad_put_user: 83bad_put_user:
84 CFI_STARTPROC simple 84 CFI_STARTPROC
85 CFI_DEF_CFA esp, 2*4 85 movl $-EFAULT,%eax
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
88 movl $-14,%eax
89 EXIT 86 EXIT
90END(bad_put_user) 87END(bad_put_user)
91 88
92.section __ex_table,"a" 89.section __ex_table,"a"
93 .long 1b,bad_put_user 90 _ASM_PTR 1b,bad_put_user
94 .long 2b,bad_put_user 91 _ASM_PTR 2b,bad_put_user
95 .long 3b,bad_put_user 92 _ASM_PTR 3b,bad_put_user
96 .long 4b,bad_put_user 93 _ASM_PTR 4b,bad_put_user
97 .long 5b,bad_put_user 94#ifdef CONFIG_X86_32
95 _ASM_PTR 5b,bad_put_user
96#endif
98.previous 97.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
deleted file mode 100644
index 4989f5a8fa9b..000000000000
--- a/arch/x86/lib/putuser_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
29#include <asm/page.h>
30#include <asm/errno.h>
31#include <asm/asm-offsets.h>
32#include <asm/thread_info.h>
33
34 .text
35ENTRY(__put_user_1)
36 CFI_STARTPROC
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
45
46ENTRY(__put_user_2)
47 CFI_STARTPROC
48 GET_THREAD_INFO(%r8)
49 addq $1,%rcx
50 jc 20f
51 cmpq threadinfo_addr_limit(%r8),%rcx
52 jae 20f
53 decq %rcx
542: movw %dx,(%rcx)
55 xorl %eax,%eax
56 ret
5720: decq %rcx
58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
61
62ENTRY(__put_user_4)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl %edx,(%rcx)
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
77
78ENTRY(__put_user_8)
79 CFI_STARTPROC
80 GET_THREAD_INFO(%r8)
81 addq $7,%rcx
82 jc 40f
83 cmpq threadinfo_addr_limit(%r8),%rcx
84 jae 40f
85 subq $7,%rcx
864: movq %rdx,(%rcx)
87 xorl %eax,%eax
88 ret
8940: subq $7,%rcx
90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
93
94bad_put_user:
95 CFI_STARTPROC
96 movq $(-EFAULT),%rax
97 ret
98 CFI_ENDPROC
99END(bad_put_user)
100
101.section __ex_table,"a"
102 .quad 1b,bad_put_user
103 .quad 2b,bad_put_user
104 .quad 3b,bad_put_user
105 .quad 4b,bad_put_user
106.previous
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
1/*
2 * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
3 * Copyright 2008 by Steven Rostedt, Red Hat, Inc
4 * (inspired by Andi Kleen's thunk_64.S)
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func
31 .globl \name
32\name:
33 pushl %eax
34 pushl %ecx
35 pushl %edx
36 /* Place EIP in the arg1 */
37 movl 3*4(%esp), %eax
38 call \func
39 popl %edx
40 popl %ecx
41 popl %eax
42 ret
43 .endm
44
45 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
46 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
2 * Save registers before calling assembly functions. This avoids 2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs. 3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs. 4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
5 * Subject to the GNU public license, v.2. No warranty of any kind. 6 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */ 7 */
7 8
@@ -42,8 +43,22 @@
42#endif 43#endif
43 44
44#ifdef CONFIG_TRACE_IRQFLAGS 45#ifdef CONFIG_TRACE_IRQFLAGS
45 thunk trace_hardirqs_on_thunk,trace_hardirqs_on 46 /* put return address in rdi (arg1) */
46 thunk trace_hardirqs_off_thunk,trace_hardirqs_off 47 .macro thunk_ra name,func
48 .globl \name
49\name:
50 CFI_STARTPROC
51 SAVE_ARGS
52 /* SAVE_ARGS pushs 9 elements */
53 /* the next element would be the rip */
54 movq 9*8(%rsp), %rdi
55 call \func
56 jmp restore
57 CFI_ENDPROC
58 .endm
59
60 thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
61 thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
47#endif 62#endif
48 63
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 64#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0c89d1bb0287..f4df6e7c718b 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -158,3 +158,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
158} 158}
159EXPORT_SYMBOL(copy_in_user); 159EXPORT_SYMBOL(copy_in_user);
160 160
161/*
162 * Try to copy last bytes and clear the rest if needed.
163 * Since protection fault in copy_from/to_user is not a normal situation,
164 * it is not necessary to optimize tail handling.
165 */
166unsigned long
167copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
168{
169 char c;
170 unsigned zero_len;
171
172 for (; len; --len) {
173 if (__get_user_nocheck(c, from++, sizeof(char)))
174 break;
175 if (__put_user_nocheck(c, to++, sizeof(char)))
176 break;
177 }
178
179 for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
180 if (__put_user_nocheck(c, to++, sizeof(char)))
181 break;
182 return len;
183}
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 0c28a071824c..3d317836be9e 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -29,6 +29,10 @@ int no_broadcast=DEFAULT_SEND_IPI;
29 **/ 29 **/
30void __init pre_intr_init_hook(void) 30void __init pre_intr_init_hook(void)
31{ 31{
32 if (x86_quirks->arch_pre_intr_init) {
33 if (x86_quirks->arch_pre_intr_init())
34 return;
35 }
32 init_ISA_irqs(); 36 init_ISA_irqs();
33} 37}
34 38
@@ -52,6 +56,10 @@ static struct irqaction irq2 = {
52 **/ 56 **/
53void __init intr_init_hook(void) 57void __init intr_init_hook(void)
54{ 58{
59 if (x86_quirks->arch_intr_init) {
60 if (x86_quirks->arch_intr_init())
61 return;
62 }
55#ifdef CONFIG_X86_LOCAL_APIC 63#ifdef CONFIG_X86_LOCAL_APIC
56 apic_intr_init(); 64 apic_intr_init();
57#endif 65#endif
@@ -65,7 +73,7 @@ void __init intr_init_hook(void)
65 * 73 *
66 * Description: 74 * Description:
67 * generally used to activate any machine specific identification 75 * generally used to activate any machine specific identification
68 * routines that may be needed before setup_arch() runs. On VISWS 76 * routines that may be needed before setup_arch() runs. On Voyager
69 * this is used to get the board revision and type. 77 * this is used to get the board revision and type.
70 **/ 78 **/
71void __init pre_setup_arch_hook(void) 79void __init pre_setup_arch_hook(void)
@@ -81,6 +89,10 @@ void __init pre_setup_arch_hook(void)
81 **/ 89 **/
82void __init trap_init_hook(void) 90void __init trap_init_hook(void)
83{ 91{
92 if (x86_quirks->arch_trap_init) {
93 if (x86_quirks->arch_trap_init())
94 return;
95 }
84} 96}
85 97
86static struct irqaction irq0 = { 98static struct irqaction irq0 = {
@@ -91,6 +103,16 @@ static struct irqaction irq0 = {
91}; 103};
92 104
93/** 105/**
106 * pre_time_init_hook - do any specific initialisations before.
107 *
108 **/
109void __init pre_time_init_hook(void)
110{
111 if (x86_quirks->arch_pre_time_init)
112 x86_quirks->arch_pre_time_init();
113}
114
115/**
94 * time_init_hook - do any specific initialisations for the system timer. 116 * time_init_hook - do any specific initialisations for the system timer.
95 * 117 *
96 * Description: 118 * Description:
@@ -99,6 +121,16 @@ static struct irqaction irq0 = {
99 **/ 121 **/
100void __init time_init_hook(void) 122void __init time_init_hook(void)
101{ 123{
124 if (x86_quirks->arch_time_init) {
125 /*
126 * A nonzero return code does not mean failure, it means
127 * that the architecture quirk does not want any
128 * generic (timer) setup to be performed after this:
129 */
130 if (x86_quirks->arch_time_init())
131 return;
132 }
133
102 irq0.mask = cpumask_of_cpu(0); 134 irq0.mask = cpumask_of_cpu(0);
103 setup_irq(0, &irq0); 135 setup_irq(0, &irq0);
104} 136}
@@ -142,45 +174,3 @@ static int __init print_ipi_mode(void)
142 174
143late_initcall(print_ipi_mode); 175late_initcall(print_ipi_mode);
144 176
145/**
146 * machine_specific_memory_setup - Hook for machine specific memory setup.
147 *
148 * Description:
149 * This is included late in kernel/setup.c so that it can make
150 * use of all of the static functions.
151 **/
152
153char * __init machine_specific_memory_setup(void)
154{
155 char *who;
156
157
158 who = "BIOS-e820";
159
160 /*
161 * Try to copy the BIOS-supplied E820-map.
162 *
163 * Otherwise fake a memory map; one section from 0k->640k,
164 * the next section from 1mb->appropriate_mem_k
165 */
166 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
167 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
168 < 0) {
169 unsigned long mem_size;
170
171 /* compare results from other methods and take the greater */
172 if (boot_params.alt_mem_k
173 < boot_params.screen_info.ext_mem_k) {
174 mem_size = boot_params.screen_info.ext_mem_k;
175 who = "BIOS-88";
176 } else {
177 mem_size = boot_params.alt_mem_k;
178 who = "BIOS-e801";
179 }
180
181 e820.nr_map = 0;
182 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
183 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
184 }
185 return who;
186}
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
index 69dd4da218dc..3ef8b43b62fc 100644
--- a/arch/x86/mach-es7000/Makefile
+++ b/arch/x86/mach-es7000/Makefile
@@ -3,4 +3,3 @@
3# 3#
4 4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o 5obj-$(CONFIG_X86_ES7000) := es7000plat.o
6obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index f5d6f7d8b86e..50189af14b85 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -52,6 +52,8 @@ static struct mip_reg *host_reg;
52static int mip_port; 52static int mip_port;
53static unsigned long mip_addr, host_addr; 53static unsigned long mip_addr, host_addr;
54 54
55int es7000_plat;
56
55/* 57/*
56 * GSI override for ES7000 platforms. 58 * GSI override for ES7000 platforms.
57 */ 59 */
@@ -128,10 +130,10 @@ parse_unisys_oem (char *oemptr)
128 mip_addr = val; 130 mip_addr = val;
129 mip = (struct mip_reg *)val; 131 mip = (struct mip_reg *)val;
130 mip_reg = __va(mip); 132 mip_reg = __va(mip);
131 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 133 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
132 (unsigned long)host_reg); 134 (unsigned long)host_reg);
133 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 135 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
134 (unsigned long)mip_reg); 136 (unsigned long)mip_reg);
135 success++; 137 success++;
136 break; 138 break;
137 case MIP_PSAI_REG: 139 case MIP_PSAI_REG:
@@ -175,53 +177,6 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr)
175} 177}
176#endif 178#endif
177 179
178/*
179 * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
180 * arch already has got following function definitions (asm-generic/es7000.c)
181 * hence no need to define these for that case.
182 */
183#ifndef CONFIG_X86_GENERICARCH
184void es7000_sw_apic(void);
185void __init enable_apic_mode(void)
186{
187 es7000_sw_apic();
188 return;
189}
190
191__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
192 char *productid)
193{
194 if (mpc->mpc_oemptr) {
195 struct mp_config_oemtable *oem_table =
196 (struct mp_config_oemtable *)mpc->mpc_oemptr;
197 if (!strncmp(oem, "UNISYS", 6))
198 return parse_unisys_oem((char *)oem_table);
199 }
200 return 0;
201}
202#ifdef CONFIG_ACPI
203/* Hook from generic ACPI tables.c */
204int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
205{
206 unsigned long oem_addr;
207 if (!find_unisys_acpi_oem_table(&oem_addr)) {
208 if (es7000_check_dsdt())
209 return parse_unisys_oem((char *)oem_addr);
210 else {
211 setup_unisys();
212 return 1;
213 }
214 }
215 return 0;
216}
217#else
218int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
219{
220 return 0;
221}
222#endif
223#endif /* COFIG_X86_GENERICARCH */
224
225static void 180static void
226es7000_spin(int n) 181es7000_spin(int n)
227{ 182{
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 19d6d407737b..0dbd7803a1d5 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -2,7 +2,11 @@
2# Makefile for the generic architecture 2# Makefile for the generic architecture
3# 3#
4 4
5EXTRA_CFLAGS := -Iarch/x86/kernel 5EXTRA_CFLAGS := -Iarch/x86/kernel
6 6
7obj-y := probe.o summit.o bigsmp.o es7000.o default.o 7obj-y := probe.o default.o
8obj-y += ../../x86/mach-es7000/ 8obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o
12obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 95fc463056d0..59d771714559 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -23,10 +23,8 @@ static int dmi_bigsmp; /* can be set by dmi scanners */
23 23
24static int hp_ht_bigsmp(const struct dmi_system_id *d) 24static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{ 25{
26#ifdef CONFIG_X86_GENERICARCH
27 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); 26 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
28 dmi_bigsmp = 1; 27 dmi_bigsmp = 1;
29#endif
30 return 0; 28 return 0;
31} 29}
32 30
@@ -48,7 +46,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
48static int probe_bigsmp(void) 46static int probe_bigsmp(void)
49{ 47{
50 if (def_to_bigsmp) 48 if (def_to_bigsmp)
51 dmi_bigsmp = 1; 49 dmi_bigsmp = 1;
52 else 50 else
53 dmi_check_system(bigsmp_dmi_table); 51 dmi_check_system(bigsmp_dmi_table);
54 return dmi_bigsmp; 52 return dmi_bigsmp;
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
new file mode 100644
index 000000000000..8091e68764c4
--- /dev/null
+++ b/arch/x86/mach-generic/numaq.c
@@ -0,0 +1,41 @@
1/*
2 * APIC driver for the IBM NUMAQ chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <linux/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/init.h>
15#include <asm/mach-numaq/mach_apic.h>
16#include <asm/mach-numaq/mach_apicdef.h>
17#include <asm/mach-numaq/mach_ipi.h>
18#include <asm/mach-numaq/mach_mpparse.h>
19#include <asm/mach-numaq/mach_wakecpu.h>
20#include <asm/numaq.h>
21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
23 char *productid)
24{
25 numaq_mps_oem_check(mpc, oem, productid);
26 return found_numaq;
27}
28
29static int probe_numaq(void)
30{
31 /* already know from get_memcfg_numaq() */
32 return found_numaq;
33}
34
35/* Hook from generic ACPI tables.c */
36static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
37{
38 return 0;
39}
40
41struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index c5ae751b994a..5a7e4619e1c4 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -16,6 +16,7 @@
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18 18
19extern struct genapic apic_numaq;
19extern struct genapic apic_summit; 20extern struct genapic apic_summit;
20extern struct genapic apic_bigsmp; 21extern struct genapic apic_bigsmp;
21extern struct genapic apic_es7000; 22extern struct genapic apic_es7000;
@@ -24,9 +25,18 @@ extern struct genapic apic_default;
24struct genapic *genapic = &apic_default; 25struct genapic *genapic = &apic_default;
25 26
26static struct genapic *apic_probe[] __initdata = { 27static struct genapic *apic_probe[] __initdata = {
28#ifdef CONFIG_X86_NUMAQ
29 &apic_numaq,
30#endif
31#ifdef CONFIG_X86_SUMMIT
27 &apic_summit, 32 &apic_summit,
33#endif
34#ifdef CONFIG_X86_BIGSMP
28 &apic_bigsmp, 35 &apic_bigsmp,
36#endif
37#ifdef CONFIG_X86_ES7000
29 &apic_es7000, 38 &apic_es7000,
39#endif
30 &apic_default, /* must be last */ 40 &apic_default, /* must be last */
31 NULL, 41 NULL,
32}; 42};
@@ -54,6 +64,7 @@ early_param("apic", parse_apic);
54 64
55void __init generic_bigsmp_probe(void) 65void __init generic_bigsmp_probe(void)
56{ 66{
67#ifdef CONFIG_X86_BIGSMP
57 /* 68 /*
58 * This routine is used to switch to bigsmp mode when 69 * This routine is used to switch to bigsmp mode when
59 * - There is no apic= option specified by the user 70 * - There is no apic= option specified by the user
@@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void)
67 printk(KERN_INFO "Overriding APIC driver with %s\n", 78 printk(KERN_INFO "Overriding APIC driver with %s\n",
68 genapic->name); 79 genapic->name);
69 } 80 }
81#endif
70} 82}
71 83
72void __init generic_apic_probe(void) 84void __init generic_apic_probe(void)
@@ -88,7 +100,8 @@ void __init generic_apic_probe(void)
88 100
89/* These functions can switch the APIC even after the initial ->probe() */ 101/* These functions can switch the APIC even after the initial ->probe() */
90 102
91int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) 103int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
104 char *productid)
92{ 105{
93 int i; 106 int i;
94 for (i = 0; apic_probe[i]; ++i) { 107 for (i = 0; apic_probe[i]; ++i) {
diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile
deleted file mode 100644
index 835fd96ad768..000000000000
--- a/arch/x86/mach-visws/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o traps.o reboot.o
6
7obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
8obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
deleted file mode 100644
index 57484e91ab90..000000000000
--- a/arch/x86/mach-visws/mpparse.c
+++ /dev/null
@@ -1,88 +0,0 @@
1
2#include <linux/init.h>
3#include <linux/smp.h>
4
5#include <asm/smp.h>
6#include <asm/io.h>
7
8#include "cobalt.h"
9#include "mach_apic.h"
10
11/* Have we found an MP table */
12int smp_found_config;
13
14int pic_mode;
15
16extern unsigned int __cpuinitdata maxcpus;
17
18/*
19 * The Visual Workstation is Intel MP compliant in the hardware
20 * sense, but it doesn't have a BIOS(-configuration table).
21 * No problem for Linux.
22 */
23
24static void __init MP_processor_info (struct mpc_config_processor *m)
25{
26 int ver, logical_apicid;
27 physid_mask_t apic_cpus;
28
29 if (!(m->mpc_cpuflag & CPU_ENABLED))
30 return;
31
32 logical_apicid = m->mpc_apicid;
33 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
34 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
35 m->mpc_apicid,
36 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
37 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
38 m->mpc_apicver);
39
40 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
41 boot_cpu_physical_apicid = m->mpc_apicid;
42
43 ver = m->mpc_apicver;
44 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
45 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
46 m->mpc_apicid, MAX_APICS);
47 return;
48 }
49
50 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
51 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
52 /*
53 * Validate version
54 */
55 if (ver == 0x0) {
56 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
57 "fixing up to 0x10. (tell your hw vendor)\n",
58 m->mpc_apicid);
59 ver = 0x10;
60 }
61 apic_version[m->mpc_apicid] = ver;
62}
63
64void __init find_smp_config(void)
65{
66 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
67 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
68
69 if (ncpus > CO_CPU_MAX) {
70 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
71 ncpus, mp);
72
73 ncpus = CO_CPU_MAX;
74 }
75
76 if (ncpus > maxcpus)
77 ncpus = maxcpus;
78
79 smp_found_config = 1;
80 while (ncpus--)
81 MP_processor_info(mp++);
82
83 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
84}
85
86void __init get_smp_config (void)
87{
88}
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
deleted file mode 100644
index 99332abfad42..000000000000
--- a/arch/x86/mach-visws/reboot.c
+++ /dev/null
@@ -1,55 +0,0 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/delay.h>
4
5#include <asm/io.h>
6#include "piix4.h"
7
8void (*pm_power_off)(void);
9EXPORT_SYMBOL(pm_power_off);
10
11void machine_shutdown(void)
12{
13#ifdef CONFIG_SMP
14 smp_send_stop();
15#endif
16}
17
18void machine_emergency_restart(void)
19{
20 /*
21 * Visual Workstations restart after this
22 * register is poked on the PIIX4
23 */
24 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
25}
26
27void machine_restart(char * __unused)
28{
29 machine_shutdown();
30 machine_emergency_restart();
31}
32
33void machine_power_off(void)
34{
35 unsigned short pm_status;
36 extern unsigned int pci_bus0;
37
38 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
39 outw(pm_status, PMSTS_PORT);
40
41 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
42
43 mdelay(10);
44
45#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
46 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
47
48 outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
49 outl(PIIX_SPECIAL_STOP, 0xCFC);
50}
51
52void machine_halt(void)
53{
54}
55
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
deleted file mode 100644
index de4c9dbd086f..000000000000
--- a/arch/x86/mach-visws/setup.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * Unmaintained SGI Visual Workstation support.
3 * Split out from setup.c by davej@suse.de
4 */
5
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/interrupt.h>
9#include <linux/module.h>
10
11#include <asm/fixmap.h>
12#include <asm/arch_hooks.h>
13#include <asm/io.h>
14#include <asm/e820.h>
15#include <asm/setup.h>
16#include "cobalt.h"
17#include "piix4.h"
18
19int no_broadcast;
20
21char visws_board_type = -1;
22char visws_board_rev = -1;
23
24void __init visws_get_board_type_and_rev(void)
25{
26 int raw;
27
28 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
29 >> PIIX_GPI_BD_SHIFT;
30 /*
31 * Get Board rev.
32 * First, we have to initialize the 307 part to allow us access
33 * to the GPIO registers. Let's map them at 0x0fc0 which is right
34 * after the PIIX4 PM section.
35 */
36 outb_p(SIO_DEV_SEL, SIO_INDEX);
37 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
38
39 outb_p(SIO_DEV_MSB, SIO_INDEX);
40 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
41
42 outb_p(SIO_DEV_LSB, SIO_INDEX);
43 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
44
45 outb_p(SIO_DEV_ENB, SIO_INDEX);
46 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
47
48 /*
49 * Now, we have to map the power management section to write
50 * a bit which enables access to the GPIO registers.
51 * What lunatic came up with this shit?
52 */
53 outb_p(SIO_DEV_SEL, SIO_INDEX);
54 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
55
56 outb_p(SIO_DEV_MSB, SIO_INDEX);
57 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
58
59 outb_p(SIO_DEV_LSB, SIO_INDEX);
60 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
61
62 outb_p(SIO_DEV_ENB, SIO_INDEX);
63 outb_p(1, SIO_DATA); /* Enable PM registers. */
64
65 /*
66 * Now, write the PM register which enables the GPIO registers.
67 */
68 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
69 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
70
71 /*
72 * Now, initialize the GPIO registers.
73 * We want them all to be inputs which is the
74 * power on default, so let's leave them alone.
75 * So, let's just read the board rev!
76 */
77 raw = inb_p(SIO_GP_DATA1);
78 raw &= 0x7f; /* 7 bits of valid board revision ID. */
79
80 if (visws_board_type == VISWS_320) {
81 if (raw < 0x6) {
82 visws_board_rev = 4;
83 } else if (raw < 0xc) {
84 visws_board_rev = 5;
85 } else {
86 visws_board_rev = 6;
87 }
88 } else if (visws_board_type == VISWS_540) {
89 visws_board_rev = 2;
90 } else {
91 visws_board_rev = raw;
92 }
93
94 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
95 (visws_board_type == VISWS_320 ? "320" :
96 (visws_board_type == VISWS_540 ? "540" :
97 "unknown")), visws_board_rev);
98}
99
100void __init pre_intr_init_hook(void)
101{
102 init_VISWS_APIC_irqs();
103}
104
105void __init intr_init_hook(void)
106{
107#ifdef CONFIG_X86_LOCAL_APIC
108 apic_intr_init();
109#endif
110}
111
112void __init pre_setup_arch_hook()
113{
114 visws_get_board_type_and_rev();
115}
116
117static struct irqaction irq0 = {
118 .handler = timer_interrupt,
119 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
120 .name = "timer",
121};
122
123void __init time_init_hook(void)
124{
125 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
126
127 /* Set the countdown value */
128 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
129
130 /* Start the timer */
131 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
132
133 /* Enable (unmask) the timer interrupt */
134 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
135
136 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
137 setup_irq(0, &irq0);
138}
139
140/* Hook for machine specific memory setup. */
141
142#define MB (1024 * 1024)
143
144unsigned long sgivwfb_mem_phys;
145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
148
149long long mem_size __initdata = 0;
150
151char * __init machine_specific_memory_setup(void)
152{
153 long long gfx_mem_size = 8 * MB;
154
155 mem_size = boot_params.alt_mem_k;
156
157 if (!mem_size) {
158 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
159 mem_size = 128 * MB;
160 }
161
162 /*
163 * this hardcodes the graphics memory to 8 MB
164 * it really should be sized dynamically (or at least
165 * set as a boot param)
166 */
167 if (!sgivwfb_mem_size) {
168 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
169 sgivwfb_mem_size = 8 * MB;
170 }
171
172 /*
173 * Trim to nearest MB
174 */
175 sgivwfb_mem_size &= ~((1 << 20) - 1);
176 sgivwfb_mem_phys = mem_size - gfx_mem_size;
177
178 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
179 add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
180 add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
181
182 return "PROM";
183}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
deleted file mode 100644
index bfac6ba10f8a..000000000000
--- a/arch/x86/mach-visws/traps.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* VISWS traps */
2
3#include <linux/sched.h>
4#include <linux/kernel.h>
5#include <linux/init.h>
6#include <linux/pci.h>
7#include <linux/pci_ids.h>
8
9#include <asm/io.h>
10#include <asm/arch_hooks.h>
11#include <asm/apic.h>
12#include "cobalt.h"
13#include "lithium.h"
14
15
16#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
17#define BCD (LI_INTB | LI_INTC | LI_INTD)
18#define ALLDEVS (A01234 | BCD)
19
20static __init void lithium_init(void)
21{
22 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
23 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
24
25 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
26 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
27 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
28 panic("This machine is not SGI Visual Workstation 320/540");
29 }
30
31 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
32 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
33 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
34 panic("This machine is not SGI Visual Workstation 320/540");
35 }
36
37 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
38 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
39}
40
41static __init void cobalt_init(void)
42{
43 /*
44 * On normal SMP PC this is used only with SMP, but we have to
45 * use it and set it up here to start the Cobalt clock
46 */
47 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
48 setup_local_APIC();
49 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
50 (unsigned int)apic_read(APIC_LVR),
51 (unsigned int)apic_read(APIC_ID));
52
53 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
54 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
55 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
56 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
57
58 /* Enable Cobalt APIC being careful to NOT change the ID! */
59 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
60
61 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
62 co_apic_read(CO_APIC_ID));
63}
64
65void __init trap_init_hook(void)
66{
67 lithium_init();
68 cobalt_init();
69}
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
deleted file mode 100644
index cef9cb1d15ac..000000000000
--- a/arch/x86/mach-visws/visws_apic.c
+++ /dev/null
@@ -1,297 +0,0 @@
1/*
2 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
3 *
4 * SGI Visual Workstation interrupt controller
5 *
6 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
7 * which serves as the main interrupt controller in the system. Non-legacy
8 * hardware in the system uses this controller directly. Legacy devices
9 * are connected to the PIIX4 which in turn has its 8259(s) connected to
10 * a of the Cobalt APIC entry.
11 *
12 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
13 *
14 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
15 */
16
17#include <linux/kernel_stat.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20
21#include <asm/io.h>
22#include <asm/apic.h>
23#include <asm/i8259.h>
24
25#include "cobalt.h"
26#include "irq_vectors.h"
27
28
29static DEFINE_SPINLOCK(cobalt_lock);
30
31/*
32 * Set the given Cobalt APIC Redirection Table entry to point
33 * to the given IDT vector/index.
34 */
35static inline void co_apic_set(int entry, int irq)
36{
37 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
38 co_apic_write(CO_APIC_HI(entry), 0);
39}
40
41/*
42 * Cobalt (IO)-APIC functions to handle PCI devices.
43 */
44static inline int co_apic_ide0_hack(void)
45{
46 extern char visws_board_type;
47 extern char visws_board_rev;
48
49 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
50 return 5;
51 return CO_APIC_IDE0;
52}
53
54static int is_co_apic(unsigned int irq)
55{
56 if (IS_CO_APIC(irq))
57 return CO_APIC(irq);
58
59 switch (irq) {
60 case 0: return CO_APIC_CPU;
61 case CO_IRQ_IDE0: return co_apic_ide0_hack();
62 case CO_IRQ_IDE1: return CO_APIC_IDE1;
63 default: return -1;
64 }
65}
66
67
68/*
69 * This is the SGI Cobalt (IO-)APIC:
70 */
71
72static void enable_cobalt_irq(unsigned int irq)
73{
74 co_apic_set(is_co_apic(irq), irq);
75}
76
77static void disable_cobalt_irq(unsigned int irq)
78{
79 int entry = is_co_apic(irq);
80
81 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
82 co_apic_read(CO_APIC_LO(entry));
83}
84
85/*
86 * "irq" really just serves to identify the device. Here is where we
87 * map this to the Cobalt APIC entry where it's physically wired.
88 * This is called via request_irq -> setup_irq -> irq_desc->startup()
89 */
90static unsigned int startup_cobalt_irq(unsigned int irq)
91{
92 unsigned long flags;
93
94 spin_lock_irqsave(&cobalt_lock, flags);
95 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
96 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
97 enable_cobalt_irq(irq);
98 spin_unlock_irqrestore(&cobalt_lock, flags);
99 return 0;
100}
101
102static void ack_cobalt_irq(unsigned int irq)
103{
104 unsigned long flags;
105
106 spin_lock_irqsave(&cobalt_lock, flags);
107 disable_cobalt_irq(irq);
108 apic_write(APIC_EOI, APIC_EIO_ACK);
109 spin_unlock_irqrestore(&cobalt_lock, flags);
110}
111
112static void end_cobalt_irq(unsigned int irq)
113{
114 unsigned long flags;
115
116 spin_lock_irqsave(&cobalt_lock, flags);
117 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
118 enable_cobalt_irq(irq);
119 spin_unlock_irqrestore(&cobalt_lock, flags);
120}
121
122static struct irq_chip cobalt_irq_type = {
123 .typename = "Cobalt-APIC",
124 .startup = startup_cobalt_irq,
125 .shutdown = disable_cobalt_irq,
126 .enable = enable_cobalt_irq,
127 .disable = disable_cobalt_irq,
128 .ack = ack_cobalt_irq,
129 .end = end_cobalt_irq,
130};
131
132
133/*
134 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
135 * -- not the manner expected by the code in i8259.c.
136 *
137 * there is a 'master' physical interrupt source that gets sent to
138 * the CPU. But in the chipset there are various 'virtual' interrupts
139 * waiting to be handled. We represent this to Linux through a 'master'
140 * interrupt controller type, and through a special virtual interrupt-
141 * controller. Device drivers only see the virtual interrupt sources.
142 */
143static unsigned int startup_piix4_master_irq(unsigned int irq)
144{
145 init_8259A(0);
146
147 return startup_cobalt_irq(irq);
148}
149
150static void end_piix4_master_irq(unsigned int irq)
151{
152 unsigned long flags;
153
154 spin_lock_irqsave(&cobalt_lock, flags);
155 enable_cobalt_irq(irq);
156 spin_unlock_irqrestore(&cobalt_lock, flags);
157}
158
159static struct irq_chip piix4_master_irq_type = {
160 .typename = "PIIX4-master",
161 .startup = startup_piix4_master_irq,
162 .ack = ack_cobalt_irq,
163 .end = end_piix4_master_irq,
164};
165
166
167static struct irq_chip piix4_virtual_irq_type = {
168 .typename = "PIIX4-virtual",
169 .shutdown = disable_8259A_irq,
170 .enable = enable_8259A_irq,
171 .disable = disable_8259A_irq,
172};
173
174
175/*
176 * PIIX4-8259 master/virtual functions to handle interrupt requests
177 * from legacy devices: floppy, parallel, serial, rtc.
178 *
179 * None of these get Cobalt APIC entries, neither do they have IDT
180 * entries. These interrupts are purely virtual and distributed from
181 * the 'master' interrupt source: CO_IRQ_8259.
182 *
183 * When the 8259 interrupts its handler figures out which of these
184 * devices is interrupting and dispatches to its handler.
185 *
186 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
187 * enable_irq gets the right irq. This 'master' irq is never directly
188 * manipulated by any driver.
189 */
190static irqreturn_t piix4_master_intr(int irq, void *dev_id)
191{
192 int realirq;
193 irq_desc_t *desc;
194 unsigned long flags;
195
196 spin_lock_irqsave(&i8259A_lock, flags);
197
198 /* Find out what's interrupting in the PIIX4 master 8259 */
199 outb(0x0c, 0x20); /* OCW3 Poll command */
200 realirq = inb(0x20);
201
202 /*
203 * Bit 7 == 0 means invalid/spurious
204 */
205 if (unlikely(!(realirq & 0x80)))
206 goto out_unlock;
207
208 realirq &= 7;
209
210 if (unlikely(realirq == 2)) {
211 outb(0x0c, 0xa0);
212 realirq = inb(0xa0);
213
214 if (unlikely(!(realirq & 0x80)))
215 goto out_unlock;
216
217 realirq = (realirq & 7) + 8;
218 }
219
220 /* mask and ack interrupt */
221 cached_irq_mask |= 1 << realirq;
222 if (unlikely(realirq > 7)) {
223 inb(0xa1);
224 outb(cached_slave_mask, 0xa1);
225 outb(0x60 + (realirq & 7), 0xa0);
226 outb(0x60 + 2, 0x20);
227 } else {
228 inb(0x21);
229 outb(cached_master_mask, 0x21);
230 outb(0x60 + realirq, 0x20);
231 }
232
233 spin_unlock_irqrestore(&i8259A_lock, flags);
234
235 desc = irq_desc + realirq;
236
237 /*
238 * handle this 'virtual interrupt' as a Cobalt one now.
239 */
240 kstat_cpu(smp_processor_id()).irqs[realirq]++;
241
242 if (likely(desc->action != NULL))
243 handle_IRQ_event(realirq, desc->action);
244
245 if (!(desc->status & IRQ_DISABLED))
246 enable_8259A_irq(realirq);
247
248 return IRQ_HANDLED;
249
250out_unlock:
251 spin_unlock_irqrestore(&i8259A_lock, flags);
252 return IRQ_NONE;
253}
254
255static struct irqaction master_action = {
256 .handler = piix4_master_intr,
257 .name = "PIIX4-8259",
258};
259
260static struct irqaction cascade_action = {
261 .handler = no_action,
262 .name = "cascade",
263};
264
265
266void init_VISWS_APIC_irqs(void)
267{
268 int i;
269
270 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
271 irq_desc[i].status = IRQ_DISABLED;
272 irq_desc[i].action = 0;
273 irq_desc[i].depth = 1;
274
275 if (i == 0) {
276 irq_desc[i].chip = &cobalt_irq_type;
277 }
278 else if (i == CO_IRQ_IDE0) {
279 irq_desc[i].chip = &cobalt_irq_type;
280 }
281 else if (i == CO_IRQ_IDE1) {
282 irq_desc[i].chip = &cobalt_irq_type;
283 }
284 else if (i == CO_IRQ_8259) {
285 irq_desc[i].chip = &piix4_master_irq_type;
286 }
287 else if (i < CO_IRQ_APIC0) {
288 irq_desc[i].chip = &piix4_virtual_irq_type;
289 }
290 else if (IS_CO_APIC(i)) {
291 irq_desc[i].chip = &cobalt_irq_type;
292 }
293 }
294
295 setup_irq(CO_IRQ_8259, &master_action);
296 setup_irq(2, &cascade_action);
297}
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 5ae5466b9eb9..6bbdd633864c 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -62,6 +62,7 @@ void __init time_init_hook(void)
62char *__init machine_specific_memory_setup(void) 62char *__init machine_specific_memory_setup(void)
63{ 63{
64 char *who; 64 char *who;
65 int new_nr;
65 66
66 who = "NOT VOYAGER"; 67 who = "NOT VOYAGER";
67 68
@@ -73,7 +74,7 @@ char *__init machine_specific_memory_setup(void)
73 74
74 e820.nr_map = 0; 75 e820.nr_map = 0;
75 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { 76 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
76 add_memory_region(addr, length, E820_RAM); 77 e820_add_region(addr, length, E820_RAM);
77 } 78 }
78 return who; 79 return who;
79 } else if (voyager_level == 4) { 80 } else if (voyager_level == 4) {
@@ -91,43 +92,17 @@ char *__init machine_specific_memory_setup(void)
91 tom = (boot_params.screen_info.ext_mem_k) << 10; 92 tom = (boot_params.screen_info.ext_mem_k) << 10;
92 } 93 }
93 who = "Voyager-TOM"; 94 who = "Voyager-TOM";
94 add_memory_region(0, 0x9f000, E820_RAM); 95 e820_add_region(0, 0x9f000, E820_RAM);
95 /* map from 1M to top of memory */ 96 /* map from 1M to top of memory */
96 add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, 97 e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
97 E820_RAM); 98 E820_RAM);
98 /* FIXME: Should check the ASICs to see if I need to 99 /* FIXME: Should check the ASICs to see if I need to
99 * take out the 8M window. Just do it at the moment 100 * take out the 8M window. Just do it at the moment
100 * */ 101 * */
101 add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024, 102 e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024,
102 E820_RESERVED); 103 E820_RESERVED);
103 return who; 104 return who;
104 } 105 }
105 106
106 who = "BIOS-e820"; 107 return default_machine_specific_memory_setup();
107
108 /*
109 * Try to copy the BIOS-supplied E820-map.
110 *
111 * Otherwise fake a memory map; one section from 0k->640k,
112 * the next section from 1mb->appropriate_mem_k
113 */
114 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
115 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
116 < 0) {
117 unsigned long mem_size;
118
119 /* compare results from other methods and take the greater */
120 if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
121 mem_size = boot_params.screen_info.ext_mem_k;
122 who = "BIOS-88";
123 } else {
124 mem_size = boot_params.alt_mem_k;
125 who = "BIOS-e801";
126 }
127
128 e820.nr_map = 0;
129 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
130 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
131 }
132 return who;
133} 108}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 8acbf0cdf1a5..ee0fba092157 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0;
59 * activity count. Finally exported by i386_ksyms.c */ 59 * activity count. Finally exported by i386_ksyms.c */
60static int voyager_extended_cpus = 1; 60static int voyager_extended_cpus = 1;
61 61
62/* Have we found an SMP box - used by time.c to do the profiling
63 interrupt for timeslicing; do not set to 1 until the per CPU timer
64 interrupt is active */
65int smp_found_config = 0;
66
67/* Used for the invalidate map that's also checked in the spinlock */ 62/* Used for the invalidate map that's also checked in the spinlock */
68static volatile unsigned long smp_invalidate_needed; 63static volatile unsigned long smp_invalidate_needed;
69 64
@@ -955,94 +950,24 @@ static void smp_stop_cpu_function(void *dummy)
955 halt(); 950 halt();
956} 951}
957 952
958static DEFINE_SPINLOCK(call_lock);
959
960struct call_data_struct {
961 void (*func) (void *info);
962 void *info;
963 volatile unsigned long started;
964 volatile unsigned long finished;
965 int wait;
966};
967
968static struct call_data_struct *call_data;
969
970/* execute a thread on a new CPU. The function to be called must be 953/* execute a thread on a new CPU. The function to be called must be
971 * previously set up. This is used to schedule a function for 954 * previously set up. This is used to schedule a function for
972 * execution on all CPUs - set up the function then broadcast a 955 * execution on all CPUs - set up the function then broadcast a
973 * function_interrupt CPI to come here on each CPU */ 956 * function_interrupt CPI to come here on each CPU */
974static void smp_call_function_interrupt(void) 957static void smp_call_function_interrupt(void)
975{ 958{
976 void (*func) (void *info) = call_data->func;
977 void *info = call_data->info;
978 /* must take copy of wait because call_data may be replaced
979 * unless the function is waiting for us to finish */
980 int wait = call_data->wait;
981 __u8 cpu = smp_processor_id();
982
983 /*
984 * Notify initiating CPU that I've grabbed the data and am
985 * about to execute the function
986 */
987 mb();
988 if (!test_and_clear_bit(cpu, &call_data->started)) {
989 /* If the bit wasn't set, this could be a replay */
990 printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion"
991 " with no call pending\n", cpu);
992 return;
993 }
994 /*
995 * At this point the info structure may be out of scope unless wait==1
996 */
997 irq_enter(); 959 irq_enter();
998 (*func) (info); 960 generic_smp_call_function_interrupt();
999 __get_cpu_var(irq_stat).irq_call_count++; 961 __get_cpu_var(irq_stat).irq_call_count++;
1000 irq_exit(); 962 irq_exit();
1001 if (wait) {
1002 mb();
1003 clear_bit(cpu, &call_data->finished);
1004 }
1005} 963}
1006 964
1007static int 965static void smp_call_function_single_interrupt(void)
1008voyager_smp_call_function_mask(cpumask_t cpumask,
1009 void (*func) (void *info), void *info, int wait)
1010{ 966{
1011 struct call_data_struct data; 967 irq_enter();
1012 u32 mask = cpus_addr(cpumask)[0]; 968 generic_smp_call_function_single_interrupt();
1013 969 __get_cpu_var(irq_stat).irq_call_count++;
1014 mask &= ~(1 << smp_processor_id()); 970 irq_exit();
1015
1016 if (!mask)
1017 return 0;
1018
1019 /* Can deadlock when called with interrupts disabled */
1020 WARN_ON(irqs_disabled());
1021
1022 data.func = func;
1023 data.info = info;
1024 data.started = mask;
1025 data.wait = wait;
1026 if (wait)
1027 data.finished = mask;
1028
1029 spin_lock(&call_lock);
1030 call_data = &data;
1031 wmb();
1032 /* Send a message to all other CPUs and wait for them to respond */
1033 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1034
1035 /* Wait for response */
1036 while (data.started)
1037 barrier();
1038
1039 if (wait)
1040 while (data.finished)
1041 barrier();
1042
1043 spin_unlock(&call_lock);
1044
1045 return 0;
1046} 971}
1047 972
1048/* Sorry about the name. In an APIC based system, the APICs 973/* Sorry about the name. In an APIC based system, the APICs
@@ -1099,6 +1024,12 @@ void smp_qic_call_function_interrupt(struct pt_regs *regs)
1099 smp_call_function_interrupt(); 1024 smp_call_function_interrupt();
1100} 1025}
1101 1026
1027void smp_qic_call_function_single_interrupt(struct pt_regs *regs)
1028{
1029 ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI);
1030 smp_call_function_single_interrupt();
1031}
1032
1102void smp_vic_cpi_interrupt(struct pt_regs *regs) 1033void smp_vic_cpi_interrupt(struct pt_regs *regs)
1103{ 1034{
1104 struct pt_regs *old_regs = set_irq_regs(regs); 1035 struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1119,6 +1050,8 @@ void smp_vic_cpi_interrupt(struct pt_regs *regs)
1119 smp_enable_irq_interrupt(); 1050 smp_enable_irq_interrupt();
1120 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) 1051 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1121 smp_call_function_interrupt(); 1052 smp_call_function_interrupt();
1053 if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu]))
1054 smp_call_function_single_interrupt();
1122 set_irq_regs(old_regs); 1055 set_irq_regs(old_regs);
1123} 1056}
1124 1057
@@ -1134,16 +1067,7 @@ static void do_flush_tlb_all(void *info)
1134/* flush the TLB of every active CPU in the system */ 1067/* flush the TLB of every active CPU in the system */
1135void flush_tlb_all(void) 1068void flush_tlb_all(void)
1136{ 1069{
1137 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1070 on_each_cpu(do_flush_tlb_all, 0, 1);
1138}
1139
1140/* used to set up the trampoline for other CPUs when the memory manager
1141 * is sorted out */
1142void __init smp_alloc_memory(void)
1143{
1144 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
1145 if (__pa(trampoline_base) >= 0x93000)
1146 BUG();
1147} 1071}
1148 1072
1149/* send a reschedule CPI to one CPU by physical CPU number*/ 1073/* send a reschedule CPI to one CPU by physical CPU number*/
@@ -1175,7 +1099,7 @@ int safe_smp_processor_id(void)
1175/* broadcast a halt to all other CPUs */ 1099/* broadcast a halt to all other CPUs */
1176static void voyager_smp_send_stop(void) 1100static void voyager_smp_send_stop(void)
1177{ 1101{
1178 smp_call_function(smp_stop_cpu_function, NULL, 1, 1); 1102 smp_call_function(smp_stop_cpu_function, NULL, 1);
1179} 1103}
1180 1104
1181/* this function is triggered in time.c when a clock tick fires 1105/* this function is triggered in time.c when a clock tick fires
@@ -1862,5 +1786,7 @@ struct smp_ops smp_ops = {
1862 1786
1863 .smp_send_stop = voyager_smp_send_stop, 1787 .smp_send_stop = voyager_smp_send_stop,
1864 .smp_send_reschedule = voyager_smp_send_reschedule, 1788 .smp_send_reschedule = voyager_smp_send_reschedule,
1865 .smp_call_function_mask = voyager_smp_call_function_mask, 1789
1790 .send_call_func_ipi = native_send_call_func_ipi,
1791 .send_call_func_single_ipi = native_send_call_func_single_ipi,
1866}; 1792};
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 04869e64b18e..00548354912f 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -16,8 +16,8 @@
16#include "reg_constant.h" 16#include "reg_constant.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19#define MAKE_REG(s,e,l,h) { l, h, \ 19#define MAKE_REG(s, e, l, h) { l, h, \
20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } 20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
21 21
22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); 22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
23#if 0 23#if 0
@@ -40,7 +40,7 @@ FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); 40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
41 41
42/* Only the sign and significand (and tag) are used in internal NaNs */ 42/* Only the sign and significand (and tag) are used in internal NaNs */
43/* The 80486 never generates one of these 43/* The 80486 never generates one of these
44FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); 44FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
45 */ 45 */
46/* This is the real indefinite QNaN */ 46/* This is the real indefinite QNaN */
@@ -49,7 +49,7 @@ FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49/* Only the sign (and tag) is used in internal infinities */ 49/* Only the sign (and tag) is used in internal infinities */
50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); 50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
51 51
52static void fld_const(FPU_REG const *c, int adj, u_char tag) 52static void fld_const(FPU_REG const * c, int adj, u_char tag)
53{ 53{
54 FPU_REG *st_new_ptr; 54 FPU_REG *st_new_ptr;
55 55
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..2977ea37791f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o
3 3
4obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
4obj-$(CONFIG_X86_32) += pgtable_32.o 5obj-$(CONFIG_X86_32) += pgtable_32.o
5 6
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 7obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
@@ -8,10 +9,17 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 9
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 10obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 11
12obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
16
11ifeq ($(CONFIG_X86_32),y) 17ifeq ($(CONFIG_X86_32),y)
12obj-$(CONFIG_NUMA) += discontig_32.o 18obj-$(CONFIG_NUMA) += discontig_32.o
13else 19else
14obj-$(CONFIG_NUMA) += numa_64.o 20obj-$(CONFIG_NUMA) += numa_64.o
15obj-$(CONFIG_K8_NUMA) += k8topology_64.o 21obj-$(CONFIG_K8_NUMA) += k8topology_64.o
16obj-$(CONFIG_ACPI_NUMA) += srat_64.o
17endif 22endif
23obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
24
25obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,10 +38,10 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
44static bootmem_data_t node0_bdata;
45 45
46/* 46/*
47 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 59/*
60 * 4) physnode_map - the mapping between a pfn and owning node 60 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 61 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 62 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 63 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 64 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 65 * physnode_map will contain:
66 * 66 *
67 * physnode_map[0-3] = 0; 67 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 68 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 69 * physnode_map[32- ] = -1;
70 */ 70 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 72EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 75{
76 unsigned long pfn; 76 unsigned long pfn;
77 77
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 78 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 79 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 81 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 84 printk(KERN_CONT "%lx ", pfn);
85 } 85 }
86 printk("\n"); 86 printk(KERN_CONT "\n");
87} 87}
88 88
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 99#endif
100 100
101extern unsigned long find_max_low_pfn(void); 101extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 102extern unsigned long highend_pfn, highstart_pfn;
104 103
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
117 */ 116 */
118int __init get_memcfg_numa_flat(void) 117int __init get_memcfg_numa_flat(void)
119{ 118{
120 printk("NUMA - single node, flat memory mode\n"); 119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 120
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 121 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 126
128 /* Indicate there is one node available. */ 127 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 128 nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 155 */
157static void __init allocate_pgdat(int nid) 156static void __init allocate_pgdat(int nid)
158{ 157{
159 if (nid && node_has_online_mem(nid)) 158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 162 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 163 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
165} 175}
166 176
167#ifdef CONFIG_DISCONTIGMEM
168/* 177/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 179 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 180 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 181 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 182 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 183 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 184 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 185static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 186static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 202 return allocation;
196} 203}
197 204
198void __init remap_numa_kva(void) 205static void __init remap_numa_kva(void)
199{ 206{
200 void *vaddr; 207 void *vaddr;
201 unsigned long pfn; 208 unsigned long pfn;
202 int node; 209 int node;
203 210
204 for_each_online_node(node) { 211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 218 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 219 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 220 PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 226{
216 int nid; 227 int nid;
217 unsigned long size, reserve_pages = 0; 228 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 229
220 for_each_online_node(nid) { 230 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 231 u64 node_kva_target;
232 u64 node_kva_final;
222 233
223 /* 234 /*
224 * The acpi/srat node info can show hot-add memroy zones 235 * The acpi/srat node info can show hot-add memroy zones
225 * where memory could be added but not currently present. 236 * where memory could be added but not currently present.
226 */ 237 */
238 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
239 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 240 if (node_start_pfn[nid] > max_pfn)
228 continue; 241 continue;
242 if (!node_end_pfn[nid])
243 continue;
229 if (node_end_pfn[nid] > max_pfn) 244 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 245 node_end_pfn[nid] = max_pfn;
231 246
@@ -237,41 +252,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 252 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 253 size = size * PTRS_PER_PTE;
239 254
240 /* 255 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 256 PTRS_PER_PTE);
242 * pages. 257 node_kva_target <<= PAGE_SHIFT;
243 */ 258 do {
244 for (pfn = node_end_pfn[nid] - size; 259 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 260 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 261 ((u64)size)<<PAGE_SHIFT,
247 break; 262 LARGE_PAGE_BYTES);
248 263 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 264 } while (node_kva_final == -1ULL &&
250 size = 0; 265 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
266
267 if (node_kva_final == -1ULL)
268 panic("Can not get kva ram\n");
251 269
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 270 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 271 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 272 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 273 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 274 " node %d at %llx\n",
259 275 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 276
261 /* 277 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 278 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 279 * with less memory later.
264 */ 280 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 281 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 282 * we are supposed to only record the one less then max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 283 * but we could have some hole in high memory, and it will only
268 } 284 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
285 * to use it as free.
286 * So reserve_early here, hope we don't run out of that array
287 */
288 reserve_early(node_kva_final,
289 node_kva_final+(((u64)size)<<PAGE_SHIFT),
290 "KVA RAM");
269 291
270 node_end_pfn[nid] -= size; 292 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 293 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 294 node_remap_start_pfn[nid] + size);
273 } 295 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 296 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 297 reserve_pages);
276 return reserve_pages; 298 return reserve_pages;
277} 299}
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 307 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 308 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 309
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 310 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 311 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 312 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 313}
311#endif /* CONFIG_DISCONTIGMEM */
312 314
313extern void setup_bootmem_allocator(void); 315void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 316 unsigned long end_pfn)
315{ 317{
316 int nid; 318 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 319 long kva_target_pfn;
318 unsigned long wasted_pages;
319 320
320 /* 321 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 322 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 325 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 326 * and ZONE_HIGHMEM.
326 */ 327 */
327 get_memcfg_numa();
328 328
329 kva_pages = calculate_numa_remap_pages(); 329 get_memcfg_numa();
330 330
331 /* partially used pages are not usable - thus round upwards */ 331 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 332
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do {
335 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
336 max_low_pfn<<PAGE_SHIFT,
337 kva_pages<<PAGE_SHIFT,
338 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
339 kva_target_pfn -= PTRS_PER_PTE;
340 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 341
336#ifdef CONFIG_BLK_DEV_INITRD 342 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 343 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 344
343 /* 345 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 346 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 347 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
348
349 /* avoid clash with initrd */
350 reserve_early(kva_start_pfn<<PAGE_SHIFT,
351 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
352 "KVA PG");
355#ifdef CONFIG_HIGHMEM 353#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 354 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 355 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 356 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 357 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 358 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 359 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 360 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 361#else
364 num_physpages = system_max_low_pfn; 362 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 363 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 364#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 365 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 366 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 367 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 368 max_low_pfn, highstart_pfn);
371 369
372 printk("Low memory ends at vaddr %08lx\n", 370 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 371 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 372 for_each_online_node(nid) {
375 init_remap_allocator(nid); 373 init_remap_allocator(nid);
376 374
377 allocate_pgdat(nid); 375 allocate_pgdat(nid);
378 } 376 }
379 printk("High memory starts at vaddr %08lx\n", 377 remap_numa_kva();
378
379 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 380 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 381 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 382 propagate_e820_map_node(nid);
383 383
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 384 for_each_online_node(nid)
385 NODE_DATA(0)->bdata = &node0_bdata; 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
386 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395}
396
397void __init zone_sizes_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417 386
418 free_area_init_nodes(max_zone_pfns); 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
419 return; 388 setup_bootmem_allocator();
420} 389}
421 390
422void __init set_highmem_pages_init(int bad_ppro) 391void __init set_highmem_pages_init(void)
423{ 392{
424#ifdef CONFIG_HIGHMEM 393#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 394 struct zone *zone;
426 struct page *page; 395 int nid;
427 396
428 for_each_zone(zone) { 397 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 398 unsigned long zone_start_pfn, zone_end_pfn;
430 399
431 if (!is_highmem(zone)) 400 if (!is_highmem(zone))
432 continue; 401 continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 403 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 404 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 405
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 406 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 407 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 408 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 409
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 410 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 411 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 412 }
448 totalram_pages += totalhigh_pages; 413 totalram_pages += totalhigh_pages;
449#endif 414#endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 441
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 442EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 443#endif
444
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..a20d1fa64b4e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 42 { 0, "User Space" },
43#ifdef CONFIG_X86_64 43#ifdef CONFIG_X86_64
44 { 0x8000000000000000UL, "Kernel Space" }, 44 { 0x8000000000000000UL, "Kernel Space" },
45 { 0xffff810000000000UL, "Low Kernel Mapping" }, 45 { PAGE_OFFSET, "Low Kernel Mapping" },
46 { VMALLOC_START, "vmalloc() Area" }, 46 { VMALLOC_START, "vmalloc() Area" },
47 { VMEMMAP_START, "Vmemmap" }, 47 { VMEMMAP_START, "Vmemmap" },
48 { __START_KERNEL_map, "High Kernel Mapping" }, 48 { __START_KERNEL_map, "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..455f3fe67b42 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -49,17 +50,23 @@
49#define PF_RSVD (1<<3) 50#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4) 51#define PF_INSTR (1<<4)
51 52
53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
54{
55#ifdef CONFIG_MMIOTRACE_HOOKS
56 if (unlikely(is_kmmio_active()))
57 if (kmmio_handler(regs, addr) == 1)
58 return -1;
59#endif
60 return 0;
61}
62
52static inline int notify_page_fault(struct pt_regs *regs) 63static inline int notify_page_fault(struct pt_regs *regs)
53{ 64{
54#ifdef CONFIG_KPROBES 65#ifdef CONFIG_KPROBES
55 int ret = 0; 66 int ret = 0;
56 67
57 /* kprobe_running() needs smp_processor_id() */ 68 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) { 69 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable(); 70 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 71 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 72 ret = 1;
@@ -396,11 +403,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
396 printk(KERN_CONT "NULL pointer dereference"); 403 printk(KERN_CONT "NULL pointer dereference");
397 else 404 else
398 printk(KERN_CONT "paging request"); 405 printk(KERN_CONT "paging request");
399#ifdef CONFIG_X86_32 406 printk(KERN_CONT " at %p\n", (void *) address);
400 printk(KERN_CONT " at %08lx\n", address);
401#else
402 printk(KERN_CONT " at %016lx\n", address);
403#endif
404 printk(KERN_ALERT "IP:"); 407 printk(KERN_ALERT "IP:");
405 printk_address(regs->ip, 1); 408 printk_address(regs->ip, 1);
406 dump_pagetable(address); 409 dump_pagetable(address);
@@ -606,6 +609,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
606 609
607 if (notify_page_fault(regs)) 610 if (notify_page_fault(regs))
608 return; 611 return;
612 if (unlikely(kmmio_fault(regs, address)))
613 return;
609 614
610 /* 615 /*
611 * We fault-in kernel-space virtual memory on-demand. The 616 * We fault-in kernel-space virtual memory on-demand. The
@@ -800,14 +805,10 @@ bad_area_nosemaphore:
800 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 805 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
801 printk_ratelimit()) { 806 printk_ratelimit()) {
802 printk( 807 printk(
803#ifdef CONFIG_X86_32 808 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
804 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
805#else
806 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
807#endif
808 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 809 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
809 tsk->comm, task_pid_nr(tsk), address, regs->ip, 810 tsk->comm, task_pid_nr(tsk), address,
810 regs->sp, error_code); 811 (void *) regs->ip, (void *) regs->sp, error_code);
811 print_vma_addr(" in ", regs->ip); 812 print_vma_addr(" in ", regs->ip);
812 printk("\n"); 813 printk("\n");
813 } 814 }
@@ -915,14 +916,7 @@ LIST_HEAD(pgd_list);
915void vmalloc_sync_all(void) 916void vmalloc_sync_all(void)
916{ 917{
917#ifdef CONFIG_X86_32 918#ifdef CONFIG_X86_32
918 /* 919 unsigned long start = VMALLOC_START & PGDIR_MASK;
919 * Note that races in the updates of insync and start aren't
920 * problematic: insync can only get set bits added, and updates to
921 * start are only improving performance (without affecting correctness
922 * if undone).
923 */
924 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
925 static unsigned long start = TASK_SIZE;
926 unsigned long address; 920 unsigned long address;
927 921
928 if (SHARED_KERNEL_PMD) 922 if (SHARED_KERNEL_PMD)
@@ -930,56 +924,38 @@ void vmalloc_sync_all(void)
930 924
931 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 925 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
932 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 926 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
933 if (!test_bit(pgd_index(address), insync)) { 927 unsigned long flags;
934 unsigned long flags; 928 struct page *page;
935 struct page *page; 929
936 930 spin_lock_irqsave(&pgd_lock, flags);
937 spin_lock_irqsave(&pgd_lock, flags); 931 list_for_each_entry(page, &pgd_list, lru) {
938 list_for_each_entry(page, &pgd_list, lru) { 932 if (!vmalloc_sync_one(page_address(page),
939 if (!vmalloc_sync_one(page_address(page), 933 address))
940 address)) 934 break;
941 break;
942 }
943 spin_unlock_irqrestore(&pgd_lock, flags);
944 if (!page)
945 set_bit(pgd_index(address), insync);
946 } 935 }
947 if (address == start && test_bit(pgd_index(address), insync)) 936 spin_unlock_irqrestore(&pgd_lock, flags);
948 start = address + PGDIR_SIZE;
949 } 937 }
950#else /* CONFIG_X86_64 */ 938#else /* CONFIG_X86_64 */
951 /* 939 unsigned long start = VMALLOC_START & PGDIR_MASK;
952 * Note that races in the updates of insync and start aren't
953 * problematic: insync can only get set bits added, and updates to
954 * start are only improving performance (without affecting correctness
955 * if undone).
956 */
957 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
958 static unsigned long start = VMALLOC_START & PGDIR_MASK;
959 unsigned long address; 940 unsigned long address;
960 941
961 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { 942 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
962 if (!test_bit(pgd_index(address), insync)) { 943 const pgd_t *pgd_ref = pgd_offset_k(address);
963 const pgd_t *pgd_ref = pgd_offset_k(address); 944 unsigned long flags;
964 unsigned long flags; 945 struct page *page;
965 struct page *page; 946
966 947 if (pgd_none(*pgd_ref))
967 if (pgd_none(*pgd_ref)) 948 continue;
968 continue; 949 spin_lock_irqsave(&pgd_lock, flags);
969 spin_lock_irqsave(&pgd_lock, flags); 950 list_for_each_entry(page, &pgd_list, lru) {
970 list_for_each_entry(page, &pgd_list, lru) { 951 pgd_t *pgd;
971 pgd_t *pgd; 952 pgd = (pgd_t *)page_address(page) + pgd_index(address);
972 pgd = (pgd_t *)page_address(page) + pgd_index(address); 953 if (pgd_none(*pgd))
973 if (pgd_none(*pgd)) 954 set_pgd(pgd, *pgd_ref);
974 set_pgd(pgd, *pgd_ref); 955 else
975 else 956 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
976 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
977 }
978 spin_unlock_irqrestore(&pgd_lock, flags);
979 set_bit(pgd_index(address), insync);
980 } 957 }
981 if (address == start) 958 spin_unlock_irqrestore(&pgd_lock, flags);
982 start = address + PGDIR_SIZE;
983 } 959 }
984#endif 960#endif
985} 961}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..3085f25b4355
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,295 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atoimcally,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long end = start + (nr_pages << PAGE_SHIFT);
227 unsigned long addr = start;
228 unsigned long next;
229 pgd_t *pgdp;
230 int nr = 0;
231
232 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
233 start, nr_pages*PAGE_SIZE)))
234 goto slow_irqon;
235
236 /*
237 * XXX: batch / limit 'nr', to avoid large irq off latency
238 * needs some instrumenting to determine the common sizes used by
239 * important workloads (eg. DB2), and whether limiting the batch size
240 * will decrease performance.
241 *
242 * It seems like we're in the clear for the moment. Direct-IO is
243 * the main guy that batches up lots of get_user_pages, and even
244 * they are limited to 64-at-a-time which is not so many.
245 */
246 /*
247 * This doesn't prevent pagetable teardown, but does prevent
248 * the pagetables and pages from being freed on x86.
249 *
250 * So long as we atomically load page table pointers versus teardown
251 * (which we do on x86, with the above PAE exception), we can follow the
252 * address down to the the page and take a ref on it.
253 */
254 local_irq_disable();
255 pgdp = pgd_offset(mm, addr);
256 do {
257 pgd_t pgd = *pgdp;
258
259 next = pgd_addr_end(addr, end);
260 if (pgd_none(pgd))
261 goto slow;
262 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
263 goto slow;
264 } while (pgdp++, addr = next, addr != end);
265 local_irq_enable();
266
267 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
268 return nr;
269
270 {
271 int ret;
272
273slow:
274 local_irq_enable();
275slow_irqon:
276 /* Try to get the remaining pages with get_user_pages */
277 start += nr << PAGE_SHIFT;
278 pages += nr;
279
280 down_read(&mm->mmap_sem);
281 ret = get_user_pages(current, mm, start,
282 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
283 up_read(&mm->mmap_sem);
284
285 /* Have to be a bit careful with return values */
286 if (nr > 0) {
287 if (ret < 0)
288 ret = nr;
289 else
290 ret += nr;
291 }
292
293 return ret;
294 }
295}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..d37f29376b0c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
50 50
51unsigned int __VMALLOC_RESERVE = 128 << 20; 51unsigned int __VMALLOC_RESERVE = 128 << 20;
52 52
53unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 54unsigned long max_pfn_mapped;
54 55
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +58,27 @@ unsigned long highstart_pfn, highend_pfn;
57 58
58static noinline int do_test_wp_bit(void); 59static noinline int do_test_wp_bit(void);
59 60
61
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67
68static __init void *alloc_low_page(unsigned long *phys)
69{
70 unsigned long pfn = table_end++;
71 void *adr;
72
73 if (pfn >= table_top)
74 panic("alloc_low_page: ran out of memory");
75
76 adr = __va(pfn * PAGE_SIZE);
77 memset(adr, 0, PAGE_SIZE);
78 *phys = pfn * PAGE_SIZE;
79 return adr;
80}
81
60/* 82/*
61 * Creates a middle page table and puts a pointer to it in the 83 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 84 * given global directory entry. This only returns the gd entry
@@ -68,9 +90,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 90 pmd_t *pmd_table;
69 91
70#ifdef CONFIG_X86_PAE 92#ifdef CONFIG_X86_PAE
93 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 94 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 95 if (after_init_bootmem)
73 96 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
97 else
98 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 99 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 100 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 101 pud = pud_offset(pgd, 0);
@@ -92,12 +117,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 118 pte_t *page_table = NULL;
94 119
120 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 121#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 123#endif
98 if (!page_table) { 124 if (!page_table)
99 page_table = 125 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 126 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
127 } else {
128 unsigned long phys;
129 page_table = (pte_t *)alloc_low_page(&phys);
101 } 130 }
102 131
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 132 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +184,44 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 184 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 185 * PAGE_OFFSET:
157 */ 186 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 187static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
188 unsigned long start_pfn,
189 unsigned long end_pfn,
190 int use_pse)
159{ 191{
160 int pgd_idx, pmd_idx, pte_ofs; 192 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 193 unsigned long pfn;
162 pgd_t *pgd; 194 pgd_t *pgd;
163 pmd_t *pmd; 195 pmd_t *pmd;
164 pte_t *pte; 196 pte_t *pte;
197 unsigned pages_2m = 0, pages_4k = 0;
165 198
166 pgd_idx = pgd_index(PAGE_OFFSET); 199 if (!cpu_has_pse)
167 pgd = pgd_base + pgd_idx; 200 use_pse = 0;
168 pfn = 0;
169 201
202 pfn = start_pfn;
203 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
204 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 205 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 206 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 207
175 for (pmd_idx = 0; 208 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 209 continue;
210#ifdef CONFIG_X86_PAE
211 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
212 pmd += pmd_idx;
213#else
214 pmd_idx = 0;
215#endif
216 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 217 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 218 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 219
180 /* 220 /*
181 * Map with big pages if possible, otherwise 221 * Map with big pages if possible, otherwise
182 * create normal page tables: 222 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 223 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 224 if (use_pse) {
190 unsigned int addr2; 225 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 226 pgprot_t prot = PAGE_KERNEL_LARGE;
192 227
@@ -197,34 +232,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 232 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 233 prot = PAGE_KERNEL_LARGE_EXEC;
199 234
235 pages_2m++;
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 236 set_pmd(pmd, pfn_pmd(pfn, prot));
201 237
202 pfn += PTRS_PER_PTE; 238 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 239 continue;
205 } 240 }
206 pte = one_page_table_init(pmd); 241 pte = one_page_table_init(pmd);
207 242
208 for (pte_ofs = 0; 243 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 244 pte += pte_ofs;
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 247 pgprot_t prot = PAGE_KERNEL;
212 248
213 if (is_kernel_text(addr)) 249 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 250 prot = PAGE_KERNEL_EXEC;
215 251
252 pages_4k++;
216 set_pte(pte, pfn_pte(pfn, prot)); 253 set_pte(pte, pfn_pte(pfn, prot));
217 } 254 }
218 max_pfn_mapped = pfn;
219 } 255 }
220 } 256 }
221} 257 update_page_count(PG_LEVEL_2M, pages_2m);
222 258 update_page_count(PG_LEVEL_4K, pages_4k);
223static inline int page_kills_ppro(unsigned long pagenr)
224{
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
226 return 1;
227 return 0;
228} 259}
229 260
230/* 261/*
@@ -287,29 +318,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 318 pkmap_page_table = pte;
288} 319}
289 320
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 321static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 322{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 323 ClearPageReserved(page);
293 ClearPageReserved(page); 324 init_page_count(page);
294 init_page_count(page); 325 __free_page(page);
295 __free_page(page); 326 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 327}
300 328
301#ifndef CONFIG_NUMA 329struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 330 unsigned long start_pfn;
331 unsigned long end_pfn;
332};
333
334static int __init add_highpages_work_fn(unsigned long start_pfn,
335 unsigned long end_pfn, void *datax)
303{ 336{
304 int pfn; 337 int node_pfn;
338 struct page *page;
339 unsigned long final_start_pfn, final_end_pfn;
340 struct add_highpages_data *data;
305 341
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 342 data = (struct add_highpages_data *)datax;
307 /* 343
308 * Holes under sparsemem might not have no mem_map[]: 344 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 345 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 346 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 347 return 0;
348
349 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
350 node_pfn++) {
351 if (!pfn_valid(node_pfn))
352 continue;
353 page = pfn_to_page(node_pfn);
354 add_one_highpage_init(page, node_pfn);
312 } 355 }
356
357 return 0;
358
359}
360
361void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
362 unsigned long end_pfn)
363{
364 struct add_highpages_data data;
365
366 data.start_pfn = start_pfn;
367 data.end_pfn = end_pfn;
368
369 work_with_active_regions(nid, add_highpages_work_fn, &data);
370}
371
372#ifndef CONFIG_NUMA
373static void __init set_highmem_pages_init(void)
374{
375 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
376
313 totalram_pages += totalhigh_pages; 377 totalram_pages += totalhigh_pages;
314} 378}
315#endif /* !CONFIG_NUMA */ 379#endif /* !CONFIG_NUMA */
@@ -317,14 +381,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 381#else
318# define kmap_init() do { } while (0) 382# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 383# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 384# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 385#endif /* CONFIG_HIGHMEM */
322 386
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 387void __init native_pagetable_setup_start(pgd_t *base)
329{ 388{
330 unsigned long pfn, va; 389 unsigned long pfn, va;
@@ -380,27 +439,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 439 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 440 * mappings.
382 */ 441 */
383static void __init pagetable_init(void) 442static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 443{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 444 unsigned long vaddr, end;
387 445
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 446 /*
405 * Fixed mappings, only the page table structure has to be 447 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 448 * created - mappings will be set by set_fixmap():
@@ -410,6 +452,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 452 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 453 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 454 early_ioremap_reset();
455}
456
457static void __init pagetable_init(void)
458{
459 pgd_t *pgd_base = swapper_pg_dir;
460
461 paravirt_pagetable_setup_start(pgd_base);
413 462
414 permanent_kmaps_init(pgd_base); 463 permanent_kmaps_init(pgd_base);
415 464
@@ -456,7 +505,7 @@ void zap_low_mappings(void)
456 505
457int nx_enabled; 506int nx_enabled;
458 507
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 508pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 509EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 510
462#ifdef CONFIG_X86_PAE 511#ifdef CONFIG_X86_PAE
@@ -509,27 +558,319 @@ static void __init set_nx(void)
509} 558}
510#endif 559#endif
511 560
561/* user-defined highmem size */
562static unsigned int highmem_pages = -1;
563
512/* 564/*
513 * paging_init() sets up the page tables - note that the first 8MB are 565 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 566 * This works even on boxes that have no highmem otherwise.
515 * 567 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 568 */
519void __init paging_init(void) 569static int __init parse_highmem(char *arg)
570{
571 if (!arg)
572 return -EINVAL;
573
574 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
575 return 0;
576}
577early_param("highmem", parse_highmem);
578
579/*
580 * Determine low and high memory ranges:
581 */
582void __init find_low_pfn_range(void)
583{
584 /* it could update max_pfn */
585
586 /* max_low_pfn is 0, we already have early_res support */
587
588 max_low_pfn = max_pfn;
589 if (max_low_pfn > MAXMEM_PFN) {
590 if (highmem_pages == -1)
591 highmem_pages = max_pfn - MAXMEM_PFN;
592 if (highmem_pages + MAXMEM_PFN < max_pfn)
593 max_pfn = MAXMEM_PFN + highmem_pages;
594 if (highmem_pages + MAXMEM_PFN > max_pfn) {
595 printk(KERN_WARNING "only %luMB highmem pages "
596 "available, ignoring highmem size of %uMB.\n",
597 pages_to_mb(max_pfn - MAXMEM_PFN),
598 pages_to_mb(highmem_pages));
599 highmem_pages = 0;
600 }
601 max_low_pfn = MAXMEM_PFN;
602#ifndef CONFIG_HIGHMEM
603 /* Maximum memory usable is what is directly addressable */
604 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
605 MAXMEM>>20);
606 if (max_pfn > MAX_NONPAE_PFN)
607 printk(KERN_WARNING
608 "Use a HIGHMEM64G enabled kernel.\n");
609 else
610 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
611 max_pfn = MAXMEM_PFN;
612#else /* !CONFIG_HIGHMEM */
613#ifndef CONFIG_HIGHMEM64G
614 if (max_pfn > MAX_NONPAE_PFN) {
615 max_pfn = MAX_NONPAE_PFN;
616 printk(KERN_WARNING "Warning only 4GB will be used."
617 "Use a HIGHMEM64G enabled kernel.\n");
618 }
619#endif /* !CONFIG_HIGHMEM64G */
620#endif /* !CONFIG_HIGHMEM */
621 } else {
622 if (highmem_pages == -1)
623 highmem_pages = 0;
624#ifdef CONFIG_HIGHMEM
625 if (highmem_pages >= max_pfn) {
626 printk(KERN_ERR "highmem size specified (%uMB) is "
627 "bigger than pages available (%luMB)!.\n",
628 pages_to_mb(highmem_pages),
629 pages_to_mb(max_pfn));
630 highmem_pages = 0;
631 }
632 if (highmem_pages) {
633 if (max_low_pfn - highmem_pages <
634 64*1024*1024/PAGE_SIZE){
635 printk(KERN_ERR "highmem size %uMB results in "
636 "smaller than 64MB lowmem, ignoring it.\n"
637 , pages_to_mb(highmem_pages));
638 highmem_pages = 0;
639 }
640 max_low_pfn -= highmem_pages;
641 }
642#else
643 if (highmem_pages)
644 printk(KERN_ERR "ignoring highmem size on non-highmem"
645 " kernel!\n");
646#endif
647 }
648}
649
650#ifndef CONFIG_NEED_MULTIPLE_NODES
651void __init initmem_init(unsigned long start_pfn,
652 unsigned long end_pfn)
520{ 653{
654#ifdef CONFIG_HIGHMEM
655 highstart_pfn = highend_pfn = max_pfn;
656 if (max_pfn > max_low_pfn)
657 highstart_pfn = max_low_pfn;
658 memory_present(0, 0, highend_pfn);
659 e820_register_active_regions(0, 0, highend_pfn);
660 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
661 pages_to_mb(highend_pfn - highstart_pfn));
662 num_physpages = highend_pfn;
663 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
664#else
665 memory_present(0, 0, max_low_pfn);
666 e820_register_active_regions(0, 0, max_low_pfn);
667 num_physpages = max_low_pfn;
668 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
669#endif
670#ifdef CONFIG_FLATMEM
671 max_mapnr = num_physpages;
672#endif
673 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
674 pages_to_mb(max_low_pfn));
675
676 setup_bootmem_allocator();
677}
678#endif /* !CONFIG_NEED_MULTIPLE_NODES */
679
680static void __init zone_sizes_init(void)
681{
682 unsigned long max_zone_pfns[MAX_NR_ZONES];
683 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
684 max_zone_pfns[ZONE_DMA] =
685 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
686 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
687#ifdef CONFIG_HIGHMEM
688 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
689#endif
690
691 free_area_init_nodes(max_zone_pfns);
692}
693
694void __init setup_bootmem_allocator(void)
695{
696 int i;
697 unsigned long bootmap_size, bootmap;
698 /*
699 * Initialize the boot-time allocator (with low memory only):
700 */
701 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
702 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
703 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
704 PAGE_SIZE);
705 if (bootmap == -1L)
706 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
707 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
708
709 /* don't touch min_low_pfn */
710 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
711 min_low_pfn, max_low_pfn);
712 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
713 max_pfn_mapped<<PAGE_SHIFT);
714 printk(KERN_INFO " low ram: %08lx - %08lx\n",
715 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
716 printk(KERN_INFO " bootmap %08lx - %08lx\n",
717 bootmap, bootmap + bootmap_size);
718 for_each_online_node(i)
719 free_bootmem_with_active_regions(i, max_low_pfn);
720 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
721
722 after_init_bootmem = 1;
723}
724
725static void __init find_early_table_space(unsigned long end)
726{
727 unsigned long puds, pmds, ptes, tables, start;
728
729 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
730 tables = PAGE_ALIGN(puds * sizeof(pud_t));
731
732 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
733 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
734
735 if (cpu_has_pse) {
736 unsigned long extra;
737
738 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
739 extra += PMD_SIZE;
740 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
741 } else
742 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
743
744 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
745
746 /* for fixmap */
747 tables += PAGE_SIZE * 2;
748
749 /*
750 * RED-PEN putting page tables only on node 0 could
751 * cause a hotspot and fill up ZONE_DMA. The page tables
752 * need roughly 0.5KB per GB.
753 */
754 start = 0x7000;
755 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
756 tables, PAGE_SIZE);
757 if (table_start == -1UL)
758 panic("Cannot find space for the kernel page tables");
759
760 table_start >>= PAGE_SHIFT;
761 table_end = table_start;
762 table_top = table_start + (tables>>PAGE_SHIFT);
763
764 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
765 end, table_start << PAGE_SHIFT,
766 (table_start << PAGE_SHIFT) + tables);
767}
768
769unsigned long __init_refok init_memory_mapping(unsigned long start,
770 unsigned long end)
771{
772 pgd_t *pgd_base = swapper_pg_dir;
773 unsigned long start_pfn, end_pfn;
774 unsigned long big_page_start;
775
776 /*
777 * Find space for the kernel direct mapping tables.
778 */
779 if (!after_init_bootmem)
780 find_early_table_space(end);
781
521#ifdef CONFIG_X86_PAE 782#ifdef CONFIG_X86_PAE
522 set_nx(); 783 set_nx();
523 if (nx_enabled) 784 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 785 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 786#endif
526 pagetable_init(); 787
788 /* Enable PSE if available */
789 if (cpu_has_pse)
790 set_in_cr4(X86_CR4_PSE);
791
792 /* Enable PGE if available */
793 if (cpu_has_pge) {
794 set_in_cr4(X86_CR4_PGE);
795 __supported_pte_mask |= _PAGE_GLOBAL;
796 }
797
798 /*
799 * Don't use a large page for the first 2/4MB of memory
800 * because there are often fixed size MTRRs in there
801 * and overlapping MTRRs into large pages can cause
802 * slowdowns.
803 */
804 big_page_start = PMD_SIZE;
805
806 if (start < big_page_start) {
807 start_pfn = start >> PAGE_SHIFT;
808 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
809 } else {
810 /* head is not big page alignment ? */
811 start_pfn = start >> PAGE_SHIFT;
812 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
813 << (PMD_SHIFT - PAGE_SHIFT);
814 }
815 if (start_pfn < end_pfn)
816 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
817
818 /* big page range */
819 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
820 << (PMD_SHIFT - PAGE_SHIFT);
821 if (start_pfn < (big_page_start >> PAGE_SHIFT))
822 start_pfn = big_page_start >> PAGE_SHIFT;
823 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
824 if (start_pfn < end_pfn)
825 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
826 cpu_has_pse);
827
828 /* tail is not big page alignment ? */
829 start_pfn = end_pfn;
830 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
831 end_pfn = end >> PAGE_SHIFT;
832 if (start_pfn < end_pfn)
833 kernel_physical_mapping_init(pgd_base, start_pfn,
834 end_pfn, 0);
835 }
836
837 early_ioremap_page_table_range_init(pgd_base);
527 838
528 load_cr3(swapper_pg_dir); 839 load_cr3(swapper_pg_dir);
529 840
530 __flush_tlb_all(); 841 __flush_tlb_all();
531 842
843 if (!after_init_bootmem)
844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE");
846
847 if (!after_init_bootmem)
848 early_memtest(start, end);
849
850 return end >> PAGE_SHIFT;
851}
852
853
854/*
855 * paging_init() sets up the page tables - note that the first 8MB are
856 * already mapped by head.S.
857 *
858 * This routines also unmaps the page at virtual kernel address 0, so
859 * that we can trap those pesky NULL-reference errors in the kernel.
860 */
861void __init paging_init(void)
862{
863 pagetable_init();
864
865 __flush_tlb_all();
866
532 kmap_init(); 867 kmap_init();
868
869 /*
870 * NOTE: at this point the bootmem allocator is fully available.
871 */
872 sparse_init();
873 zone_sizes_init();
533} 874}
534 875
535/* 876/*
@@ -564,24 +905,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 905void __init mem_init(void)
565{ 906{
566 int codesize, reservedpages, datasize, initsize; 907 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 908 int tmp;
568 909
569#ifdef CONFIG_FLATMEM 910#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 911 BUG_ON(!mem_map);
571#endif 912#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 913 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 914 totalram_pages += free_all_bootmem();
587 915
@@ -593,7 +921,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 921 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 922 reservedpages++;
595 923
596 set_highmem_pages_init(bad_ppro); 924 set_highmem_pages_init();
597 925
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 926 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 927 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +942,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 942 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 943 );
616 944
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 945 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 946 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 947#ifdef CONFIG_HIGHMEM
@@ -655,7 +982,6 @@ void __init mem_init(void)
655#endif 982#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 983 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 984 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 985
660 if (boot_cpu_data.wp_works_ok < 0) 986 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 987 test_wp_bit();
@@ -710,6 +1036,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1036 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1037 unsigned long size = PFN_ALIGN(_etext) - start;
712 1038
1039#ifndef CONFIG_DYNAMIC_FTRACE
1040 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1041 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1042 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1043 size >> 10);
@@ -722,6 +1050,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1050 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1051 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1052#endif
1053#endif /* CONFIG_DYNAMIC_FTRACE */
1054
725 start += size; 1055 start += size;
726 size = (unsigned long)__end_rodata - start; 1056 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1057 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1114,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1114 free_init_pages("initrd memory", start, end);
785} 1115}
786#endif 1116#endif
1117
1118int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1119 int flags)
1120{
1121 return reserve_bootmem(phys, len, flags);
1122}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 819dad973b13..129618ca0ea2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -47,6 +48,14 @@
47#include <asm/numa.h> 48#include <asm/numa.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
49 50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
56unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped;
58
50static unsigned long dma_reserve __initdata; 59static unsigned long dma_reserve __initdata;
51 60
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -77,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on);
77 * around without checking the pgd every time. 86 * around without checking the pgd every time.
78 */ 87 */
79 88
80void show_mem(void)
81{
82 long i, total = 0, reserved = 0;
83 long shared = 0, cached = 0;
84 struct page *page;
85 pg_data_t *pgdat;
86
87 printk(KERN_INFO "Mem-info:\n");
88 show_free_areas();
89 for_each_online_pgdat(pgdat) {
90 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
91 /*
92 * This loop can take a while with 256 GB and
93 * 4k pages so defer the NMI watchdog:
94 */
95 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96 touch_nmi_watchdog();
97
98 if (!pfn_valid(pgdat->node_start_pfn + i))
99 continue;
100
101 page = pfn_to_page(pgdat->node_start_pfn + i);
102 total++;
103 if (PageReserved(page))
104 reserved++;
105 else if (PageSwapCache(page))
106 cached++;
107 else if (page_count(page))
108 shared += page_count(page) - 1;
109 }
110 }
111 printk(KERN_INFO "%lu pages of RAM\n", total);
112 printk(KERN_INFO "%lu reserved pages\n", reserved);
113 printk(KERN_INFO "%lu pages shared\n", shared);
114 printk(KERN_INFO "%lu pages swap cached\n", cached);
115}
116
117int after_bootmem; 89int after_bootmem;
118 90
119static __init void *spp_getpage(void) 91static __init void *spp_getpage(void)
@@ -135,26 +107,17 @@ static __init void *spp_getpage(void)
135 return ptr; 107 return ptr;
136} 108}
137 109
138static __init void 110void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 111set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 112{
141 pgd_t *pgd;
142 pud_t *pud; 113 pud_t *pud;
143 pmd_t *pmd; 114 pmd_t *pmd;
144 pte_t *pte, new_pte; 115 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 116
148 pgd = pgd_offset_k(vaddr); 117 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 118 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 119 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 120 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 121 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 122 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 123 pmd, pmd_offset(pud, 0));
@@ -164,13 +127,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 127 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 128 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 129 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 130 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 131 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 132 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 133 return;
171 } 134 }
172 } 135 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 136
175 pte = pte_offset_kernel(pmd, vaddr); 137 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 138 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +147,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 147 __flush_tlb_one(vaddr);
186} 148}
187 149
150void
151set_pte_vaddr(unsigned long vaddr, pte_t pteval)
152{
153 pgd_t *pgd;
154 pud_t *pud_page;
155
156 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
157
158 pgd = pgd_offset_k(vaddr);
159 if (pgd_none(*pgd)) {
160 printk(KERN_ERR
161 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
162 return;
163 }
164 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
165 set_pte_vaddr_pud(pud_page, vaddr, pteval);
166}
167
168/*
169 * Create large page table mappings for a range of physical addresses.
170 */
171static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
172 pgprot_t prot)
173{
174 pgd_t *pgd;
175 pud_t *pud;
176 pmd_t *pmd;
177
178 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
179 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
180 pgd = pgd_offset_k((unsigned long)__va(phys));
181 if (pgd_none(*pgd)) {
182 pud = (pud_t *) spp_getpage();
183 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
184 _PAGE_USER));
185 }
186 pud = pud_offset(pgd, (unsigned long)__va(phys));
187 if (pud_none(*pud)) {
188 pmd = (pmd_t *) spp_getpage();
189 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
190 _PAGE_USER));
191 }
192 pmd = pmd_offset(pud, phys);
193 BUG_ON(!pmd_none(*pmd));
194 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
195 }
196}
197
198void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
199{
200 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
201}
202
203void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
204{
205 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
206}
207
188/* 208/*
189 * The head.S code sets up the kernel high mapping: 209 * The head.S code sets up the kernel high mapping:
190 * 210 *
@@ -213,20 +233,9 @@ void __init cleanup_highmap(void)
213 } 233 }
214} 234}
215 235
216/* NOTE: this is meant to be run only at boot */
217void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 236static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 237static unsigned long __meminitdata table_end;
238static unsigned long __meminitdata table_top;
230 239
231static __meminit void *alloc_low_page(unsigned long *phys) 240static __meminit void *alloc_low_page(unsigned long *phys)
232{ 241{
@@ -240,7 +249,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 249 return adr;
241 } 250 }
242 251
243 if (pfn >= end_pfn) 252 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 253 panic("alloc_low_page: ran out of memory");
245 254
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 255 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -257,65 +266,61 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 266 early_iounmap(adr, PAGE_SIZE);
258} 267}
259 268
260/* Must run before zap_low_mappings */ 269static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 270phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
262{ 271{
263 pmd_t *pmd, *last_pmd; 272 unsigned pages = 0;
264 unsigned long vaddr; 273 unsigned long last_map_addr = end;
265 int i, pmds; 274 int i;
275
276 pte_t *pte = pte_page + pte_index(addr);
266 277
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 278 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 279
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 280 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 281 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 282 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 283 set_pte(pte, __pte(0));
284 }
285 break;
276 } 286 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 287
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 288 if (pte_val(*pte))
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 289 continue;
282 __flush_tlb_all();
283 290
284 return (void *)vaddr; 291 if (0)
285continue_outer_loop: 292 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 293 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
294 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
295 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
296 pages++;
287 } 297 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); 298 update_page_count(PG_LEVEL_4K, pages);
289 299
290 return NULL; 300 return last_map_addr;
291} 301}
292 302
293/* 303static unsigned long __meminit
294 * To avoid virtual aliases later: 304phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
295 */
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 305{
298 unsigned long vaddr; 306 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301
302 vaddr = (unsigned long)addr;
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305 307
306 for (i = 0; i < pmds; i++) 308 return phys_pte_init(pte, address, end);
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 309}
311 310
312static unsigned long __meminit 311static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 312phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
313 unsigned long page_size_mask)
314{ 314{
315 unsigned long pages = 0;
316 unsigned long last_map_addr = end;
317
315 int i = pmd_index(address); 318 int i = pmd_index(address);
316 319
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 320 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
321 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 322 pmd_t *pmd = pmd_page + pmd_index(address);
323 pte_t *pte;
319 324
320 if (address >= end) { 325 if (address >= end) {
321 if (!after_bootmem) { 326 if (!after_bootmem) {
@@ -325,31 +330,50 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 330 break;
326 } 331 }
327 332
328 if (pmd_val(*pmd)) 333 if (pmd_val(*pmd)) {
334 if (!pmd_large(*pmd))
335 last_map_addr = phys_pte_update(pmd, address,
336 end);
329 continue; 337 continue;
338 }
330 339
331 set_pte((pte_t *)pmd, 340 if (page_size_mask & (1<<PG_LEVEL_2M)) {
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 341 pages++;
342 set_pte((pte_t *)pmd,
343 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
344 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
345 continue;
346 }
347
348 pte = alloc_low_page(&pte_phys);
349 last_map_addr = phys_pte_init(pte, address, end);
350 unmap_low_page(pte);
351
352 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
333 } 353 }
334 return address; 354 update_page_count(PG_LEVEL_2M, pages);
355 return last_map_addr;
335} 356}
336 357
337static unsigned long __meminit 358static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 359phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
360 unsigned long page_size_mask)
339{ 361{
340 pmd_t *pmd = pmd_offset(pud, 0); 362 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 363 unsigned long last_map_addr;
342 364
343 spin_lock(&init_mm.page_table_lock); 365 spin_lock(&init_mm.page_table_lock);
344 last_map_addr = phys_pmd_init(pmd, address, end); 366 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
345 spin_unlock(&init_mm.page_table_lock); 367 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 368 __flush_tlb_all();
347 return last_map_addr; 369 return last_map_addr;
348} 370}
349 371
350static unsigned long __meminit 372static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 373phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
374 unsigned long page_size_mask)
352{ 375{
376 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 377 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 378 int i = pud_index(addr);
355 379
@@ -369,11 +393,13 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
369 393
370 if (pud_val(*pud)) { 394 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 395 if (!pud_large(*pud))
372 last_map_addr = phys_pmd_update(pud, addr, end); 396 last_map_addr = phys_pmd_update(pud, addr, end,
397 page_size_mask);
373 continue; 398 continue;
374 } 399 }
375 400
376 if (direct_gbpages) { 401 if (page_size_mask & (1<<PG_LEVEL_1G)) {
402 pages++;
377 set_pte((pte_t *)pud, 403 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 404 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 405 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@ -383,27 +409,50 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
383 pmd = alloc_low_page(&pmd_phys); 409 pmd = alloc_low_page(&pmd_phys);
384 410
385 spin_lock(&init_mm.page_table_lock); 411 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 412 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
387 last_map_addr = phys_pmd_init(pmd, addr, end); 413 unmap_low_page(pmd);
414 pud_populate(&init_mm, pud, __va(pmd_phys));
388 spin_unlock(&init_mm.page_table_lock); 415 spin_unlock(&init_mm.page_table_lock);
389 416
390 unmap_low_page(pmd);
391 } 417 }
392 __flush_tlb_all(); 418 __flush_tlb_all();
419 update_page_count(PG_LEVEL_1G, pages);
393 420
394 return last_map_addr >> PAGE_SHIFT; 421 return last_map_addr;
422}
423
424static unsigned long __meminit
425phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
426 unsigned long page_size_mask)
427{
428 pud_t *pud;
429
430 pud = (pud_t *)pgd_page_vaddr(*pgd);
431
432 return phys_pud_init(pud, addr, end, page_size_mask);
395} 433}
396 434
397static void __init find_early_table_space(unsigned long end) 435static void __init find_early_table_space(unsigned long end)
398{ 436{
399 unsigned long puds, pmds, tables, start; 437 unsigned long puds, pmds, ptes, tables, start;
400 438
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 439 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 440 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 441 if (direct_gbpages) {
442 unsigned long extra;
443 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
444 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
445 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 446 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 447 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 448
449 if (cpu_has_pse) {
450 unsigned long extra;
451 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
452 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
453 } else
454 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
455 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
407 456
408 /* 457 /*
409 * RED-PEN putting page tables only on node 0 could 458 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +466,10 @@ static void __init find_early_table_space(unsigned long end)
417 466
418 table_start >>= PAGE_SHIFT; 467 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 468 table_end = table_start;
469 table_top = table_start + (tables >> PAGE_SHIFT);
420 470
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 471 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 472 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 473}
425 474
426static void __init init_gbpages(void) 475static void __init init_gbpages(void)
@@ -431,126 +480,85 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 480 direct_gbpages = 0;
432} 481}
433 482
434#ifdef CONFIG_MEMTEST_BOOTPARAM 483static unsigned long __init kernel_physical_mapping_init(unsigned long start,
435 484 unsigned long end,
436static void __init memtest(unsigned long start_phys, unsigned long size, 485 unsigned long page_size_mask)
437 unsigned pattern) 486{
438{
439 unsigned long i;
440 unsigned long *start;
441 unsigned long start_bad;
442 unsigned long last_bad;
443 unsigned long val;
444 unsigned long start_phys_aligned;
445 unsigned long count;
446 unsigned long incr;
447
448 switch (pattern) {
449 case 0:
450 val = 0UL;
451 break;
452 case 1:
453 val = -1UL;
454 break;
455 case 2:
456 val = 0x5555555555555555UL;
457 break;
458 case 3:
459 val = 0xaaaaaaaaaaaaaaaaUL;
460 break;
461 default:
462 return;
463 }
464 487
465 incr = sizeof(unsigned long); 488 unsigned long next, last_map_addr = end;
466 start_phys_aligned = ALIGN(start_phys, incr);
467 count = (size - (start_phys_aligned - start_phys))/incr;
468 start = __va(start_phys_aligned);
469 start_bad = 0;
470 last_bad = 0;
471
472 for (i = 0; i < count; i++)
473 start[i] = val;
474 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
475 if (*start != val) {
476 if (start_phys_aligned == last_bad + incr) {
477 last_bad += incr;
478 } else {
479 if (start_bad) {
480 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
481 val, start_bad, last_bad + incr);
482 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
483 }
484 start_bad = last_bad = start_phys_aligned;
485 }
486 }
487 }
488 if (start_bad) {
489 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
490 val, start_bad, last_bad + incr);
491 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
492 }
493 489
494} 490 start = (unsigned long)__va(start);
491 end = (unsigned long)__va(end);
495 492
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 493 for (; start < end; start = next) {
494 pgd_t *pgd = pgd_offset_k(start);
495 unsigned long pud_phys;
496 pud_t *pud;
497 497
498static int __init parse_memtest(char *arg) 498 next = (start + PGDIR_SIZE) & PGDIR_MASK;
499{ 499 if (next > end)
500 if (arg) 500 next = end;
501 memtest_pattern = simple_strtoul(arg, NULL, 0);
502 return 0;
503}
504 501
505early_param("memtest", parse_memtest); 502 if (pgd_val(*pgd)) {
503 last_map_addr = phys_pud_update(pgd, __pa(start),
504 __pa(end), page_size_mask);
505 continue;
506 }
506 507
507static void __init early_memtest(unsigned long start, unsigned long end) 508 if (after_bootmem)
508{ 509 pud = pud_offset(pgd, start & PGDIR_MASK);
509 u64 t_start, t_size; 510 else
510 unsigned pattern; 511 pud = alloc_low_page(&pud_phys);
511 512
512 if (!memtest_pattern) 513 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
513 return; 514 page_size_mask);
515 unmap_low_page(pud);
516 pgd_populate(&init_mm, pgd_offset_k(start),
517 __va(pud_phys));
518 }
514 519
515 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 520 return last_map_addr;
516 for (pattern = 0; pattern < memtest_pattern; pattern++) { 521}
517 t_start = start;
518 t_size = 0;
519 while (t_start < end) {
520 t_start = find_e820_area_size(t_start, &t_size, 1);
521 522
522 /* done ? */ 523struct map_range {
523 if (t_start >= end) 524 unsigned long start;
524 break; 525 unsigned long end;
525 if (t_start + t_size > end) 526 unsigned page_size_mask;
526 t_size = end - t_start; 527};
527 528
528 printk(KERN_CONT "\n %016llx - %016llx pattern %d", 529#define NR_RANGE_MR 5
529 (unsigned long long)t_start,
530 (unsigned long long)t_start + t_size, pattern);
531 530
532 memtest(t_start, t_size, pattern); 531static int save_mr(struct map_range *mr, int nr_range,
532 unsigned long start_pfn, unsigned long end_pfn,
533 unsigned long page_size_mask)
534{
533 535
534 t_start += t_size; 536 if (start_pfn < end_pfn) {
535 } 537 if (nr_range >= NR_RANGE_MR)
538 panic("run out of range for init_memory_mapping\n");
539 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
540 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
541 mr[nr_range].page_size_mask = page_size_mask;
542 nr_range++;
536 } 543 }
537 printk(KERN_CONT "\n"); 544
538} 545 return nr_range;
539#else
540static void __init early_memtest(unsigned long start, unsigned long end)
541{
542} 546}
543#endif
544 547
545/* 548/*
546 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 549 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
547 * This runs before bootmem is initialized and gets pages directly from 550 * This runs before bootmem is initialized and gets pages directly from
548 * the physical memory. To access them they are temporarily mapped. 551 * the physical memory. To access them they are temporarily mapped.
549 */ 552 */
550unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 553unsigned long __init_refok init_memory_mapping(unsigned long start,
554 unsigned long end)
551{ 555{
552 unsigned long next, last_map_addr = end; 556 unsigned long last_map_addr = 0;
553 unsigned long start_phys = start, end_phys = end; 557 unsigned long page_size_mask = 0;
558 unsigned long start_pfn, end_pfn;
559
560 struct map_range mr[NR_RANGE_MR];
561 int nr_range, i;
554 562
555 printk(KERN_INFO "init_memory_mapping\n"); 563 printk(KERN_INFO "init_memory_mapping\n");
556 564
@@ -561,48 +569,115 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
561 * memory mapped. Unfortunately this is done currently before the 569 * memory mapped. Unfortunately this is done currently before the
562 * nodes are discovered. 570 * nodes are discovered.
563 */ 571 */
564 if (!after_bootmem) { 572 if (!after_bootmem)
565 init_gbpages(); 573 init_gbpages();
566 find_early_table_space(end);
567 }
568 574
569 start = (unsigned long)__va(start); 575 if (direct_gbpages)
570 end = (unsigned long)__va(end); 576 page_size_mask |= 1 << PG_LEVEL_1G;
577 if (cpu_has_pse)
578 page_size_mask |= 1 << PG_LEVEL_2M;
579
580 memset(mr, 0, sizeof(mr));
581 nr_range = 0;
582
583 /* head if not big page alignment ?*/
584 start_pfn = start >> PAGE_SHIFT;
585 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
586 << (PMD_SHIFT - PAGE_SHIFT);
587 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
588
589 /* big page (2M) range*/
590 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
591 << (PMD_SHIFT - PAGE_SHIFT);
592 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
593 << (PUD_SHIFT - PAGE_SHIFT);
594 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
595 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
596 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
597 page_size_mask & (1<<PG_LEVEL_2M));
598
599 /* big page (1G) range */
600 start_pfn = end_pfn;
601 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
602 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
603 page_size_mask &
604 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
605
606 /* tail is not big page (1G) alignment */
607 start_pfn = end_pfn;
608 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
609 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
610 page_size_mask & (1<<PG_LEVEL_2M));
611
612 /* tail is not big page (2M) alignment */
613 start_pfn = end_pfn;
614 end_pfn = end>>PAGE_SHIFT;
615 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
616
617 /* try to merge same page size and continuous */
618 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
619 unsigned long old_start;
620 if (mr[i].end != mr[i+1].start ||
621 mr[i].page_size_mask != mr[i+1].page_size_mask)
622 continue;
623 /* move it */
624 old_start = mr[i].start;
625 memmove(&mr[i], &mr[i+1],
626 (nr_range - 1 - i) * sizeof (struct map_range));
627 mr[i].start = old_start;
628 nr_range--;
629 }
571 630
572 for (; start < end; start = next) { 631 for (i = 0; i < nr_range; i++)
573 pgd_t *pgd = pgd_offset_k(start); 632 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
574 unsigned long pud_phys; 633 mr[i].start, mr[i].end,
575 pud_t *pud; 634 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
635 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
576 636
577 if (after_bootmem) 637 if (!after_bootmem)
578 pud = pud_offset(pgd, start & PGDIR_MASK); 638 find_early_table_space(end);
579 else
580 pud = alloc_low_page(&pud_phys);
581 639
582 next = start + PGDIR_SIZE; 640 for (i = 0; i < nr_range; i++)
583 if (next > end) 641 last_map_addr = kernel_physical_mapping_init(
584 next = end; 642 mr[i].start, mr[i].end,
585 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 643 mr[i].page_size_mask);
586 if (!after_bootmem)
587 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
588 unmap_low_page(pud);
589 }
590 644
591 if (!after_bootmem) 645 if (!after_bootmem)
592 mmu_cr4_features = read_cr4(); 646 mmu_cr4_features = read_cr4();
593 __flush_tlb_all(); 647 __flush_tlb_all();
594 648
595 if (!after_bootmem) 649 if (!after_bootmem && table_end > table_start)
596 reserve_early(table_start << PAGE_SHIFT, 650 reserve_early(table_start << PAGE_SHIFT,
597 table_end << PAGE_SHIFT, "PGTABLE"); 651 table_end << PAGE_SHIFT, "PGTABLE");
598 652
653 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
654 last_map_addr, end);
655
599 if (!after_bootmem) 656 if (!after_bootmem)
600 early_memtest(start_phys, end_phys); 657 early_memtest(start, end);
601 658
602 return last_map_addr; 659 return last_map_addr >> PAGE_SHIFT;
603} 660}
604 661
605#ifndef CONFIG_NUMA 662#ifndef CONFIG_NUMA
663void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
664{
665 unsigned long bootmap_size, bootmap;
666
667 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
668 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
669 PAGE_SIZE);
670 if (bootmap == -1L)
671 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
672 /* don't touch min_low_pfn */
673 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
674 0, end_pfn);
675 e820_register_active_regions(0, start_pfn, end_pfn);
676 free_bootmem_with_active_regions(0, end_pfn);
677 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
678 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
679}
680
606void __init paging_init(void) 681void __init paging_init(void)
607{ 682{
608 unsigned long max_zone_pfns[MAX_NR_ZONES]; 683 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -610,9 +685,9 @@ void __init paging_init(void)
610 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 685 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
611 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 686 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
612 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 687 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
613 max_zone_pfns[ZONE_NORMAL] = end_pfn; 688 max_zone_pfns[ZONE_NORMAL] = max_pfn;
614 689
615 memory_present(0, 0, end_pfn); 690 memory_present(0, 0, max_pfn);
616 sparse_init(); 691 sparse_init();
617 free_area_init_nodes(max_zone_pfns); 692 free_area_init_nodes(max_zone_pfns);
618} 693}
@@ -694,8 +769,8 @@ void __init mem_init(void)
694#else 769#else
695 totalram_pages = free_all_bootmem(); 770 totalram_pages = free_all_bootmem();
696#endif 771#endif
697 reservedpages = end_pfn - totalram_pages - 772 reservedpages = max_pfn - totalram_pages -
698 absent_pages_in_range(0, end_pfn); 773 absent_pages_in_range(0, max_pfn);
699 after_bootmem = 1; 774 after_bootmem = 1;
700 775
701 codesize = (unsigned long) &_etext - (unsigned long) &_text; 776 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -714,7 +789,7 @@ void __init mem_init(void)
714 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 789 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
715 "%ldk reserved, %ldk data, %ldk init)\n", 790 "%ldk reserved, %ldk data, %ldk init)\n",
716 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 791 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
717 end_pfn << (PAGE_SHIFT-10), 792 max_pfn << (PAGE_SHIFT-10),
718 codesize >> 10, 793 codesize >> 10,
719 reservedpages << (PAGE_SHIFT-10), 794 reservedpages << (PAGE_SHIFT-10),
720 datasize >> 10, 795 datasize >> 10,
@@ -767,6 +842,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
767void mark_rodata_ro(void) 842void mark_rodata_ro(void)
768{ 843{
769 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 844 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
845 unsigned long rodata_start =
846 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
847
848#ifdef CONFIG_DYNAMIC_FTRACE
849 /* Dynamic tracing modifies the kernel text section */
850 start = rodata_start;
851#endif
770 852
771 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 853 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
772 (end - start) >> 10); 854 (end - start) >> 10);
@@ -776,8 +858,7 @@ void mark_rodata_ro(void)
776 * The rodata section (but not the kernel text!) should also be 858 * The rodata section (but not the kernel text!) should also be
777 * not-executable. 859 * not-executable.
778 */ 860 */
779 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 861 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
780 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
781 862
782 rodata_test(); 863 rodata_test();
783 864
@@ -799,24 +880,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
799} 880}
800#endif 881#endif
801 882
802void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 883int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
884 int flags)
803{ 885{
804#ifdef CONFIG_NUMA 886#ifdef CONFIG_NUMA
805 int nid, next_nid; 887 int nid, next_nid;
888 int ret;
806#endif 889#endif
807 unsigned long pfn = phys >> PAGE_SHIFT; 890 unsigned long pfn = phys >> PAGE_SHIFT;
808 891
809 if (pfn >= end_pfn) { 892 if (pfn >= max_pfn) {
810 /* 893 /*
811 * This can happen with kdump kernels when accessing 894 * This can happen with kdump kernels when accessing
812 * firmware tables: 895 * firmware tables:
813 */ 896 */
814 if (pfn < max_pfn_mapped) 897 if (pfn < max_pfn_mapped)
815 return; 898 return -EFAULT;
816 899
817 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 900 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
818 phys, len); 901 phys, len);
819 return; 902 return -EFAULT;
820 } 903 }
821 904
822 /* Should check here against the e820 map to avoid double free */ 905 /* Should check here against the e820 map to avoid double free */
@@ -824,9 +907,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
824 nid = phys_to_nid(phys); 907 nid = phys_to_nid(phys);
825 next_nid = phys_to_nid(phys + len - 1); 908 next_nid = phys_to_nid(phys + len - 1);
826 if (nid == next_nid) 909 if (nid == next_nid)
827 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 910 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
828 else 911 else
829 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 912 ret = reserve_bootmem(phys, len, flags);
913
914 if (ret != 0)
915 return ret;
916
830#else 917#else
831 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 918 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
832#endif 919#endif
@@ -835,6 +922,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
835 dma_reserve += len / PAGE_SIZE; 922 dma_reserve += len / PAGE_SIZE;
836 set_dma_reserve(dma_reserve); 923 set_dma_reserve(dma_reserve);
837 } 924 }
925
926 return 0;
838} 927}
839 928
840int kern_addr_valid(unsigned long addr) 929int kern_addr_valid(unsigned long addr)
@@ -939,7 +1028,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
939 pmd_t *pmd; 1028 pmd_t *pmd;
940 1029
941 for (; addr < end; addr = next) { 1030 for (; addr < end; addr = next) {
942 next = pmd_addr_end(addr, end); 1031 void *p = NULL;
943 1032
944 pgd = vmemmap_pgd_populate(addr, node); 1033 pgd = vmemmap_pgd_populate(addr, node);
945 if (!pgd) 1034 if (!pgd)
@@ -949,33 +1038,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
949 if (!pud) 1038 if (!pud)
950 return -ENOMEM; 1039 return -ENOMEM;
951 1040
952 pmd = pmd_offset(pud, addr); 1041 if (!cpu_has_pse) {
953 if (pmd_none(*pmd)) { 1042 next = (addr + PAGE_SIZE) & PAGE_MASK;
954 pte_t entry; 1043 pmd = vmemmap_pmd_populate(pud, addr, node);
955 void *p; 1044
1045 if (!pmd)
1046 return -ENOMEM;
1047
1048 p = vmemmap_pte_populate(pmd, addr, node);
956 1049
957 p = vmemmap_alloc_block(PMD_SIZE, node);
958 if (!p) 1050 if (!p)
959 return -ENOMEM; 1051 return -ENOMEM;
960 1052
961 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1053 addr_end = addr + PAGE_SIZE;
962 PAGE_KERNEL_LARGE); 1054 p_end = p + PAGE_SIZE;
963 set_pmd(pmd, __pmd(pte_val(entry)));
964
965 /* check to see if we have contiguous blocks */
966 if (p_end != p || node_start != node) {
967 if (p_start)
968 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
969 addr_start, addr_end-1, p_start, p_end-1, node_start);
970 addr_start = addr;
971 node_start = node;
972 p_start = p;
973 }
974 addr_end = addr + PMD_SIZE;
975 p_end = p + PMD_SIZE;
976 } else { 1055 } else {
977 vmemmap_verify((pte_t *)pmd, node, addr, next); 1056 next = pmd_addr_end(addr, end);
1057
1058 pmd = pmd_offset(pud, addr);
1059 if (pmd_none(*pmd)) {
1060 pte_t entry;
1061
1062 p = vmemmap_alloc_block(PMD_SIZE, node);
1063 if (!p)
1064 return -ENOMEM;
1065
1066 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1067 PAGE_KERNEL_LARGE);
1068 set_pmd(pmd, __pmd(pte_val(entry)));
1069
1070 /* check to see if we have contiguous blocks */
1071 if (p_end != p || node_start != node) {
1072 if (p_start)
1073 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1074 addr_start, addr_end-1, p_start, p_end-1, node_start);
1075 addr_start = addr;
1076 node_start = node;
1077 p_start = p;
1078 }
1079
1080 addr_end = addr + PMD_SIZE;
1081 p_end = p + PMD_SIZE;
1082 } else
1083 vmemmap_verify((pte_t *)pmd, node, addr, next);
978 } 1084 }
1085
979 } 1086 }
980 return 0; 1087 return 0;
981} 1088}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d1b867101e5f..016f335bbeea 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 123{
123 unsigned long pfn, offset, vaddr; 124 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 125 resource_size_t last_addr;
126 const resource_size_t unaligned_phys_addr = phys_addr;
127 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 128 struct vm_struct *area;
126 unsigned long new_prot_val; 129 unsigned long new_prot_val;
127 pgprot_t prot; 130 pgprot_t prot;
128 int retval; 131 int retval;
132 void __iomem *ret_addr;
129 133
130 /* Don't allow wraparound or zero size */ 134 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 135 last_addr = phys_addr + size - 1;
@@ -142,7 +146,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 146 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 147 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 148 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 149 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 150 return (__force void __iomem *)phys_to_virt(phys_addr);
147 151
148 /* 152 /*
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 237 return NULL;
234 } 238 }
235 239
236 return (void __iomem *) (vaddr + offset); 240 ret_addr = (void __iomem *) (vaddr + offset);
241 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
242
243 return ret_addr;
237} 244}
238 245
239/** 246/**
@@ -261,7 +268,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 268{
262 /* 269 /*
263 * Ideally, this should be: 270 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 271 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 272 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 273 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 274 * UC MINUS.
@@ -285,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 292 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 293void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 294{
288 if (pat_wc_enabled) 295 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 296 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 297 __builtin_return_address(0));
291 else 298 else
@@ -323,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
323 return (void __iomem *)ret; 330 return (void __iomem *)ret;
324} 331}
325 332
333void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
334 unsigned long prot_val)
335{
336 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
337 __builtin_return_address(0));
338}
339EXPORT_SYMBOL(ioremap_prot);
340
326/** 341/**
327 * iounmap - Free a IO remapping 342 * iounmap - Free a IO remapping
328 * @addr: virtual address from ioremap_* 343 * @addr: virtual address from ioremap_*
@@ -341,13 +356,15 @@ void iounmap(volatile void __iomem *addr)
341 * vm_area and by simply returning an address into the kernel mapping 356 * vm_area and by simply returning an address into the kernel mapping
342 * of ISA space. So handle that here. 357 * of ISA space. So handle that here.
343 */ 358 */
344 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 359 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
345 addr < phys_to_virt(ISA_END_ADDRESS)) 360 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
346 return; 361 return;
347 362
348 addr = (volatile void __iomem *) 363 addr = (volatile void __iomem *)
349 (PAGE_MASK & (unsigned long __force)addr); 364 (PAGE_MASK & (unsigned long __force)addr);
350 365
366 mmiotrace_iounmap(addr);
367
351 /* Use the vm area unlocked, assuming the caller 368 /* Use the vm area unlocked, assuming the caller
352 ensures there isn't another iounmap for the same address 369 ensures there isn't another iounmap for the same address
353 in parallel. Reuse of the virtual address is prevented by 370 in parallel. Reuse of the virtual address is prevented by
@@ -355,7 +372,7 @@ void iounmap(volatile void __iomem *addr)
355 cpa takes care of the direct mappings. */ 372 cpa takes care of the direct mappings. */
356 read_lock(&vmlist_lock); 373 read_lock(&vmlist_lock);
357 for (p = vmlist; p; p = p->next) { 374 for (p = vmlist; p; p = p->next) {
358 if (p->addr == addr) 375 if (p->addr == (void __force *)addr)
359 break; 376 break;
360 } 377 }
361 read_unlock(&vmlist_lock); 378 read_unlock(&vmlist_lock);
@@ -369,7 +386,7 @@ void iounmap(volatile void __iomem *addr)
369 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 386 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
370 387
371 /* Finally remove it */ 388 /* Finally remove it */
372 o = remove_vm_area((void *)addr); 389 o = remove_vm_area((void __force *)addr);
373 BUG_ON(p != o || o == NULL); 390 BUG_ON(p != o || o == NULL);
374 kfree(p); 391 kfree(p);
375} 392}
@@ -388,7 +405,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
388 if (page_is_ram(start >> PAGE_SHIFT)) 405 if (page_is_ram(start >> PAGE_SHIFT))
389 return __va(phys); 406 return __va(phys);
390 407
391 addr = (void *)ioremap_default(start, PAGE_SIZE); 408 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
392 if (addr) 409 if (addr)
393 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 410 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
394 411
@@ -404,8 +421,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
404 return; 421 return;
405} 422}
406 423
407#ifdef CONFIG_X86_32
408
409int __initdata early_ioremap_debug; 424int __initdata early_ioremap_debug;
410 425
411static int __init early_ioremap_debug_setup(char *str) 426static int __init early_ioremap_debug_setup(char *str)
@@ -417,8 +432,7 @@ static int __init early_ioremap_debug_setup(char *str)
417early_param("early_ioremap_debug", early_ioremap_debug_setup); 432early_param("early_ioremap_debug", early_ioremap_debug_setup);
418 433
419static __initdata int after_paging_init; 434static __initdata int after_paging_init;
420static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 435static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
421 __section(.bss.page_aligned);
422 436
423static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 437static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
424{ 438{
@@ -507,10 +521,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
507 return; 521 return;
508 } 522 }
509 pte = early_ioremap_pte(addr); 523 pte = early_ioremap_pte(addr);
524
510 if (pgprot_val(flags)) 525 if (pgprot_val(flags))
511 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 526 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
512 else 527 else
513 pte_clear(NULL, addr, pte); 528 pte_clear(&init_mm, addr, pte);
514 __flush_tlb_one(addr); 529 __flush_tlb_one(addr);
515} 530}
516 531
@@ -648,5 +663,3 @@ void __this_fixmap_does_not_exist(void)
648{ 663{
649 WARN_ON(1); 664 WARN_ON(1);
650} 665}
651
652#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
2 * Benfit many code from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
75 * this is basically a dynamic stabbing problem:
76 * Could use the existing prio tree code or
77 * Possible better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
164 * We may be in an interrupt or a critical section. Also prefecthing may
165 * trigger a page fault. We may be in the middle of process switch.
166 * We cannot take any locks, because we could be executing especially
167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry as trap3 is an interrupt gate
174 * and they remain disabled thorough out this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
269 * and they remain disabled thorough out this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
443 * Actally free the kmmio_fault_page structs as with RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..e7397e108beb
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,515 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
59#if 0 /* XXX: no way gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always return the active_trace count to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433static void leave_uniprocessor(void)
434{
435 int cpu;
436 int err;
437
438 if (cpus_weight(downed_cpus) == 0)
439 return;
440 pr_notice(NAME "Re-enabling CPUs...\n");
441 for_each_cpu_mask(cpu, downed_cpus) {
442 err = cpu_up(cpu);
443 if (!err)
444 pr_info(NAME "enabled CPU%d.\n", cpu);
445 else
446 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
447 }
448}
449
450#else /* !CONFIG_HOTPLUG_CPU */
451static void enter_uniprocessor(void)
452{
453 if (num_online_cpus() > 1)
454 pr_warning(NAME "multiple CPUs are online, may miss events. "
455 "Suggest booting with maxcpus=1 kernel argument.\n");
456}
457
458static void leave_uniprocessor(void)
459{
460}
461#endif
462
463#if 0 /* XXX: out of order */
464static struct file_operations fops_marker = {
465 .owner = THIS_MODULE,
466 .write = write_marker
467};
468#endif
469
470void enable_mmiotrace(void)
471{
472 mutex_lock(&mmiotrace_mutex);
473 if (is_enabled())
474 goto out;
475
476#if 0 /* XXX: tracing does not support text entries */
477 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
478 &fops_marker);
479 if (!marker_file)
480 pr_err(NAME "marker file creation failed.\n");
481#endif
482
483 if (nommiotrace)
484 pr_info(NAME "MMIO tracing disabled.\n");
485 enter_uniprocessor();
486 spin_lock_irq(&trace_lock);
487 atomic_inc(&mmiotrace_enabled);
488 spin_unlock_irq(&trace_lock);
489 pr_info(NAME "enabled.\n");
490out:
491 mutex_unlock(&mmiotrace_mutex);
492}
493
494void disable_mmiotrace(void)
495{
496 mutex_lock(&mmiotrace_mutex);
497 if (!is_enabled())
498 goto out;
499
500 spin_lock_irq(&trace_lock);
501 atomic_dec(&mmiotrace_enabled);
502 BUG_ON(is_enabled());
503 spin_unlock_irq(&trace_lock);
504
505 clear_trace_list(); /* guarantees: no more kmmio callbacks */
506 leave_uniprocessor();
507 if (marker_file) {
508 debugfs_remove(marker_file);
509 marker_file = NULL;
510 }
511
512 pr_info(NAME "disabled.\n");
513out:
514 mutex_unlock(&mmiotrace_mutex);
515}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,37 +20,18 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 30};
47 31
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 32int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
54 35
55/* 36/*
56 * Given a shift value, try to populate memnodemap[] 37 * Given a shift value, try to populate memnodemap[]
@@ -99,7 +80,7 @@ static int __init allocate_cachealigned_memnodemap(void)
99 80
100 addr = 0x8000; 81 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 86 printk(KERN_ERR
@@ -192,7 +173,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 173void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 174 unsigned long end)
194{ 175{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 178 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +185,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 185 start, end);
205 186
206 start_pfn = start >> PAGE_SHIFT; 187 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 188 last_pfn = end >> PAGE_SHIFT;
208 189
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 190 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 191 SMP_CACHE_BYTES);
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
215 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
216 197
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 202
222 /* 203 /*
223 * Find a place for the bootmem map 204 * Find a place for the bootmem map
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 207 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 208 * of alloc_bootmem, that could clash with reserved range
228 */ 209 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 212 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 214 else
234 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 216 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
238 */ 219 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 229
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 231 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 232 start_pfn, last_pfn);
252 233
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 234 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 235 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +290,7 @@ void __init numa_init_array(void)
309 290
310#ifdef CONFIG_NUMA_EMU 291#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 292/* Numa emulation */
312char *cmdline __initdata; 293static char *cmdline __initdata;
313 294
314/* 295/*
315 * Setups up nid to range from addr to addr + size. If the end 296 * Setups up nid to range from addr to addr + size. If the end
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 394}
414 395
415/* 396/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 397 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 398 * numa=fake command-line option.
418 */ 399 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 400static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 401
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 402static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 403{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 404 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 405 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 406 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 407
427 memset(&nodes, 0, sizeof(nodes)); 408 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +508,7 @@ out:
527} 508}
528#endif /* CONFIG_NUMA_EMU */ 509#endif /* CONFIG_NUMA_EMU */
529 510
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 511void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 512{
532 int i; 513 int i;
533 514
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 516 nodes_clear(node_online_map);
536 517
537#ifdef CONFIG_NUMA_EMU 518#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 519 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 520 return;
540 nodes_clear(node_possible_map); 521 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 524
544#ifdef CONFIG_ACPI_NUMA 525#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 526 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 527 last_pfn << PAGE_SHIFT))
547 return; 528 return;
548 nodes_clear(node_possible_map); 529 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 530 nodes_clear(node_online_map);
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 532
552#ifdef CONFIG_K8_NUMA 533#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 534 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 535 last_pfn<<PAGE_SHIFT))
555 return; 536 return;
556 nodes_clear(node_possible_map); 537 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 538 nodes_clear(node_online_map);
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 542
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 544 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 545 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 546 /* setup dummy node covering all memory */
566 memnode_shift = 63; 547 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 548 memnodemap = memnode.embedded_map;
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 553 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 554 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 556}
597 557
598unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +573,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 577
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 579 sparse_init();
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt)
641} 601}
642early_param("numa", numa_setup); 602early_param("numa", numa_setup);
643 603
604#ifdef CONFIG_NUMA
644/* 605/*
645 * Setup early cpu_to_node. 606 * Setup early cpu_to_node.
646 * 607 *
@@ -652,14 +613,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 613 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 614 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 615 * for the fake NUMA cases.
616 *
617 * Called before the per_cpu areas are setup.
655 */ 618 */
656void __init init_cpu_to_node(void) 619void __init init_cpu_to_node(void)
657{ 620{
658 int i; 621 int cpu;
622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 623
660 for (i = 0; i < NR_CPUS; i++) { 624 BUG_ON(cpu_to_apicid == NULL);
625
626 for_each_possible_cpu(cpu) {
661 int node; 627 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 628 u16 apicid = cpu_to_apicid[cpu];
663 629
664 if (apicid == BAD_APICID) 630 if (apicid == BAD_APICID)
665 continue; 631 continue;
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void)
668 continue; 634 continue;
669 if (!node_online(node)) 635 if (!node_online(node))
670 continue; 636 continue;
671 numa_set_node(i, node); 637 numa_set_node(cpu, node);
672 } 638 }
673} 639}
640#endif
674 641
675 642
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..0dcd42eb94e6 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears the a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
165 continue; 172 continue;
166 } 173 }
167 174
168 err = change_page_attr_clear(addr[i], len[i], 175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
169 __pgprot(_PAGE_GLOBAL));
170 if (err < 0) { 176 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 177 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 178 failed++;
173 } 179 }
174 180
175 pte = lookup_address(addr[i], &level); 181 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 182 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 183 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 184 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 185 failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
198 failed++; 204 failed++;
199 continue; 205 continue;
200 } 206 }
201 err = change_page_attr_set(addr[i], len[i], 207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
202 __pgprot(_PAGE_GLOBAL));
203 if (err < 0) { 208 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 209 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 210 failed++;
206 } 211 }
207 pte = lookup_address(addr[i], &level); 212 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 213 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 214 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 215 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 216 failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..65c6e46bf059 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,41 @@ struct cpa_data {
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35}; 35};
36 36
37#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40void update_page_count(int level, unsigned long pages)
41{
42 unsigned long flags;
43
44 /* Protect against CPA */
45 spin_lock_irqsave(&pgd_lock, flags);
46 direct_pages_count[level] += pages;
47 spin_unlock_irqrestore(&pgd_lock, flags);
48}
49
50static void split_page_count(int level)
51{
52 direct_pages_count[level]--;
53 direct_pages_count[level - 1] += PTRS_PER_PTE;
54}
55
56int arch_report_meminfo(char *page)
57{
58 int n = sprintf(page, "DirectMap4k: %8lu\n"
59 "DirectMap2M: %8lu\n",
60 direct_pages_count[PG_LEVEL_4K],
61 direct_pages_count[PG_LEVEL_2M]);
62#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n",
64 direct_pages_count[PG_LEVEL_1G]);
65#endif
66 return n;
67}
68#else
69static inline void split_page_count(int level) { }
70#endif
71
37#ifdef CONFIG_X86_64 72#ifdef CONFIG_X86_64
38 73
39static inline unsigned long highmap_start_pfn(void) 74static inline unsigned long highmap_start_pfn(void)
@@ -106,7 +141,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 141{
107 BUG_ON(irqs_disabled()); 142 BUG_ON(irqs_disabled());
108 143
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 144 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 145}
111 146
112static void __cpa_flush_range(void *arg) 147static void __cpa_flush_range(void *arg)
@@ -127,7 +162,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 162 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 163 WARN_ON(PAGE_ALIGN(start) != start);
129 164
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 165 on_each_cpu(__cpa_flush_range, NULL, 1);
131 166
132 if (!cache) 167 if (!cache)
133 return; 168 return;
@@ -227,6 +262,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 262
228 return pte_offset_kernel(pmd, address); 263 return pte_offset_kernel(pmd, address);
229} 264}
265EXPORT_SYMBOL_GPL(lookup_address);
230 266
231/* 267/*
232 * Set the new pmd in all the pgds we know about: 268 * Set the new pmd in all the pgds we know about:
@@ -500,6 +536,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 536 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 537 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 538
539 if (address >= (unsigned long)__va(0) &&
540 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
541 split_page_count(level);
542
543#ifdef CONFIG_X86_64
544 if (address >= (unsigned long)__va(1UL<<32) &&
545 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
546 split_page_count(level);
547#endif
548
503 /* 549 /*
504 * Install the new, split up pagetable. Important details here: 550 * Install the new, split up pagetable. Important details here:
505 * 551 *
@@ -613,15 +659,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
613 struct cpa_data alias_cpa; 659 struct cpa_data alias_cpa;
614 int ret = 0; 660 int ret = 0;
615 661
616 if (cpa->pfn > max_pfn_mapped) 662 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 663 return 0;
618 664
665#ifdef CONFIG_X86_64
666 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
667 return 0;
668#endif
619 /* 669 /*
620 * No need to redo, when the primary call touched the direct 670 * No need to redo, when the primary call touched the direct
621 * mapping already: 671 * mapping already:
622 */ 672 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 673 if (!(within(cpa->vaddr, PAGE_OFFSET,
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 674 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
675#ifdef CONFIG_X86_64
676 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
677 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
678#endif
679 )) {
625 680
626 alias_cpa = *cpa; 681 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 682 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -805,7 +860,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
805 860
806int set_memory_wc(unsigned long addr, int numpages) 861int set_memory_wc(unsigned long addr, int numpages)
807{ 862{
808 if (!pat_wc_enabled) 863 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 864 return set_memory_uc(addr, numpages);
810 865
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 866 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..2fe30916d4b6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -26,11 +28,11 @@
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 31int __read_mostly pat_enabled = 1;
30 32
31void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(char *reason)
32{ 34{
33 pat_wc_enabled = 0; 35 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
35} 37}
36 38
@@ -42,6 +44,19 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 44early_param("nopat", nopat);
43#endif 45#endif
44 46
47
48static int debug_enable;
49static int __init pat_debug_setup(char *str)
50{
51 debug_enable = 1;
52 return 0;
53}
54__setup("debugpat", pat_debug_setup);
55
56#define dprintk(fmt, arg...) \
57 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
58
59
45static u64 __read_mostly boot_pat_state; 60static u64 __read_mostly boot_pat_state;
46 61
47enum { 62enum {
@@ -53,24 +68,25 @@ enum {
53 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ 68 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
54}; 69};
55 70
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 71#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 72
58void pat_init(void) 73void pat_init(void)
59{ 74{
60 u64 pat; 75 u64 pat;
61 76
62 if (!pat_wc_enabled) 77 if (!pat_enabled)
63 return; 78 return;
64 79
65 /* Paranoia check. */ 80 /* Paranoia check. */
66 if (!cpu_has_pat) { 81 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 82 /*
69 * Panic if this happens on the secondary CPU, and we 83 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 84 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 85 * undo PAT.
72 */ 86 */
73 BUG_ON(boot_pat_state); 87 printk(KERN_ERR "PAT enabled, "
88 "but not supported by secondary CPU\n");
89 BUG();
74 } 90 }
75 91
76 /* Set PWT to Write-Combining. All other bits stay the same */ 92 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +102,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 102 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 103 * PAT bit unused
88 */ 104 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 105 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 106 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 107
92 /* Boot CPU check */ 108 /* Boot CPU check */
93 if (!boot_pat_state) 109 if (!boot_pat_state)
@@ -103,11 +119,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 119static char *cattr_name(unsigned long flags)
104{ 120{
105 switch (flags & _PAGE_CACHE_MASK) { 121 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 122 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 123 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 124 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 125 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 126 default: return "broken";
111 } 127 }
112} 128}
113 129
@@ -145,47 +161,50 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 161 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 162 * SDM vol 3a
147 */ 163 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 164static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 165{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 166 /*
175 * Look for MTRR hint to get the effective type in case where PAT 167 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 168 * request is for WB.
177 */ 169 */
178 mtrr_type = mtrr_type_lookup(start, end); 170 if (req_type == _PAGE_CACHE_WB) {
171 u8 mtrr_type;
172
173 mtrr_type = mtrr_type_lookup(start, end);
174 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
175 return _PAGE_CACHE_UC;
176 if (mtrr_type == MTRR_TYPE_WRCOMB)
177 return _PAGE_CACHE_WC;
178 }
179 179
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 180 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 181}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 182
183 *ret_prot = prot | _PAGE_CACHE_WC; 183static int chk_conflict(struct memtype *new, struct memtype *entry,
184 } else { 184 unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 185{
186 if (new->type != entry->type) {
187 if (type) {
188 new->type = entry->type;
189 *type = entry->type;
190 } else
191 goto conflict;
186 } 192 }
187 193
194 /* check overlaps with more than one entry in the list */
195 list_for_each_entry_continue(entry, &memtype_list, nd) {
196 if (new->end <= entry->start)
197 break;
198 else if (new->type != entry->type)
199 goto conflict;
200 }
188 return 0; 201 return 0;
202
203 conflict:
204 printk(KERN_INFO "%s:%d conflicting memory types "
205 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
206 new->end, cattr_name(new->type), cattr_name(entry->type));
207 return -EBUSY;
189} 208}
190 209
191/* 210/*
@@ -198,37 +217,36 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when requester want to inherit 217 * req_type will have a special case value '-1', when requester want to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 218 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 219 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 220 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 221 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 222 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 223 * it will return a negative return value.
205 */ 224 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 225int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 226 unsigned long *new_type)
208{ 227{
209 struct memtype *new_entry = NULL; 228 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 229 unsigned long actual_type;
230 struct list_head *where;
212 int err = 0; 231 int err = 0;
213 232
214 /* Only track when pat_wc_enabled */ 233 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 234
235 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 236 /* This is identical to page table setting without PAT */
217 if (ret_type) { 237 if (new_type) {
218 if (req_type == -1) { 238 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 239 *new_type = _PAGE_CACHE_WB;
220 } else { 240 else
221 *ret_type = req_type; 241 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 242 }
224 return 0; 243 return 0;
225 } 244 }
226 245
227 /* Low ISA region is always mapped WB in page table. No need to track */ 246 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 247 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 248 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 249 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 250 return 0;
233 } 251 }
234 252
@@ -241,206 +259,92 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 259 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 260 u8 mtrr_type = mtrr_type_lookup(start, end);
243 261
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 262 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 263 actual_type = _PAGE_CACHE_WB;
247 } else { 264 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 265 actual_type = _PAGE_CACHE_UC_MINUS;
250 } 266 } else
251 } else { 267 actual_type = pat_x_mtrr_type(start, end,
252 req_type &= _PAGE_CACHE_MASK; 268 req_type & _PAGE_CACHE_MASK);
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
254 }
255
256 if (err) {
257 if (ret_type)
258 *ret_type = actual_type;
259 269
260 return -EINVAL; 270 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
261 } 271 if (!new)
262
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry)
265 return -ENOMEM; 272 return -ENOMEM;
266 273
267 new_entry->start = start; 274 new->start = start;
268 new_entry->end = end; 275 new->end = end;
269 new_entry->type = actual_type; 276 new->type = actual_type;
270 277
271 if (ret_type) 278 if (new_type)
272 *ret_type = actual_type; 279 *new_type = actual_type;
273 280
274 spin_lock(&memtype_lock); 281 spin_lock(&memtype_lock);
275 282
276 /* Search for existing mapping that overlaps the current range */ 283 /* Search for existing mapping that overlaps the current range */
277 list_for_each_entry(parse, &memtype_list, nd) { 284 where = NULL;
278 struct memtype *saved_ptr; 285 list_for_each_entry(entry, &memtype_list, nd) {
279 286 if (end <= entry->start) {
280 if (parse->start >= end) { 287 where = entry->nd.prev;
281 pr_debug("New Entry\n");
282 list_add(&new_entry->nd, parse->nd.prev);
283 new_entry = NULL;
284 break; 288 break;
285 } 289 } else if (start <= entry->start) { /* end > entry->start */
286 290 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 291 if (!err) {
288 if (actual_type != parse->type && ret_type) { 292 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 293 entry->start, entry->end);
290 *ret_type = actual_type; 294 where = entry->nd.prev;
291 new_entry->type = actual_type;
292 } 295 }
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 296 break;
337 } 297 } else if (start < entry->end) { /* start > entry->start */
338 298 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 299 if (!err) {
340 if (actual_type != parse->type && ret_type) { 300 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 301 entry->start, entry->end);
342 *ret_type = actual_type; 302 where = &entry->nd;
343 new_entry->type = actual_type;
344 }
345
346 if (actual_type != parse->type) {
347 printk(
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
349 current->comm, current->pid,
350 start, end,
351 cattr_name(actual_type),
352 cattr_name(parse->type));
353 err = -EBUSY;
354 break;
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 }
377 } 303 }
378
379 if (err) {
380 break;
381 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 304 break;
389 } 305 }
390 } 306 }
391 307
392 if (err) { 308 if (err) {
393 printk(KERN_INFO 309 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 310 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 311 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 312 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 313 spin_unlock(&memtype_lock);
399 return err; 314 return err;
400 } 315 }
401 316
402 if (new_entry) { 317 if (where)
403 /* No conflict. Not yet added to the list. Add to the tail */ 318 list_add(&new->nd, where);
404 list_add_tail(&new_entry->nd, &memtype_list); 319 else
405 pr_debug("New Entry\n"); 320 list_add_tail(&new->nd, &memtype_list);
406 }
407
408 if (ret_type) {
409 pr_debug(
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
411 start, end, cattr_name(actual_type),
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 321
420 spin_unlock(&memtype_lock); 322 spin_unlock(&memtype_lock);
323
324 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
325 start, end, cattr_name(new->type), cattr_name(req_type),
326 new_type ? cattr_name(*new_type) : "-");
327
421 return err; 328 return err;
422} 329}
423 330
424int free_memtype(u64 start, u64 end) 331int free_memtype(u64 start, u64 end)
425{ 332{
426 struct memtype *ml; 333 struct memtype *entry;
427 int err = -EINVAL; 334 int err = -EINVAL;
428 335
429 /* Only track when pat_wc_enabled */ 336 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 337 return 0;
432 }
433 338
434 /* Low ISA region is always mapped WB. No need to track */ 339 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 340 if (is_ISA_range(start, end - 1))
436 return 0; 341 return 0;
437 }
438 342
439 spin_lock(&memtype_lock); 343 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 344 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 345 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 346 list_del(&entry->nd);
443 kfree(ml); 347 kfree(entry);
444 err = 0; 348 err = 0;
445 break; 349 break;
446 } 350 }
@@ -452,7 +356,7 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 356 current->comm, current->pid, start, end);
453 } 357 }
454 358
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 359 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
456 return err; 360 return err;
457} 361}
458 362
@@ -471,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
471 return vma_prot; 375 return vma_prot;
472} 376}
473 377
474#ifdef CONFIG_NONPROMISC_DEVMEM 378#ifdef CONFIG_STRICT_DEVMEM
475/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 379/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
476static inline int range_is_allowed(unsigned long pfn, unsigned long size) 380static inline int range_is_allowed(unsigned long pfn, unsigned long size)
477{ 381{
478 return 1; 382 return 1;
@@ -496,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
496 } 400 }
497 return 1; 401 return 1;
498} 402}
499#endif /* CONFIG_NONPROMISC_DEVMEM */ 403#endif /* CONFIG_STRICT_DEVMEM */
500 404
501int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 405int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
502 unsigned long size, pgprot_t *vma_prot) 406 unsigned long size, pgprot_t *vma_prot)
@@ -521,12 +425,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 425 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 426 * we maintain the tradition of paranoia in this code.
523 */ 427 */
524 if (!pat_wc_enabled && 428 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 429 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 430 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 431 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 432 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 433 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 434 flags = _PAGE_CACHE_UC;
531 } 435 }
532#endif 436#endif
@@ -547,8 +451,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
547 if (retval < 0) 451 if (retval < 0)
548 return 0; 452 return 0;
549 453
550 if (pfn <= max_pfn_mapped && 454 if (((pfn < max_low_pfn_mapped) ||
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 455 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
456 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 457 free_memtype(offset, offset + size);
553 printk(KERN_INFO 458 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 459 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -587,3 +492,88 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
587 free_memtype(addr, addr + size); 492 free_memtype(addr, addr + size);
588} 493}
589 494
495#if defined(CONFIG_DEBUG_FS)
496
497/* get Nth element of the linked list */
498static struct memtype *memtype_get_idx(loff_t pos)
499{
500 struct memtype *list_node, *print_entry;
501 int i = 1;
502
503 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
504 if (!print_entry)
505 return NULL;
506
507 spin_lock(&memtype_lock);
508 list_for_each_entry(list_node, &memtype_list, nd) {
509 if (pos == i) {
510 *print_entry = *list_node;
511 spin_unlock(&memtype_lock);
512 return print_entry;
513 }
514 ++i;
515 }
516 spin_unlock(&memtype_lock);
517 kfree(print_entry);
518 return NULL;
519}
520
521static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
522{
523 if (*pos == 0) {
524 ++*pos;
525 seq_printf(seq, "PAT memtype list:\n");
526 }
527
528 return memtype_get_idx(*pos);
529}
530
531static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
532{
533 ++*pos;
534 return memtype_get_idx(*pos);
535}
536
537static void memtype_seq_stop(struct seq_file *seq, void *v)
538{
539}
540
541static int memtype_seq_show(struct seq_file *seq, void *v)
542{
543 struct memtype *print_entry = (struct memtype *)v;
544
545 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
546 print_entry->start, print_entry->end);
547 kfree(print_entry);
548 return 0;
549}
550
551static struct seq_operations memtype_seq_ops = {
552 .start = memtype_seq_start,
553 .next = memtype_seq_next,
554 .stop = memtype_seq_stop,
555 .show = memtype_seq_show,
556};
557
558static int memtype_seq_open(struct inode *inode, struct file *file)
559{
560 return seq_open(file, &memtype_seq_ops);
561}
562
563static const struct file_operations memtype_fops = {
564 .open = memtype_seq_open,
565 .read = seq_read,
566 .llseek = seq_lseek,
567 .release = seq_release,
568};
569
570static int __init pat_memtype_list_init(void)
571{
572 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
573 NULL, &memtype_fops);
574 return 0;
575}
576
577late_initcall(pat_memtype_list_init);
578
579#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Crop.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Crop., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is extension instruction */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register ident in mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Crop.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
27 NOTHING, /* access others point in regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* Other instructions can not intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
65static void pgd_ctor(void *p) 66static void pgd_ctor(void *p)
66{ 67{
67 pgd_t *pgd = p; 68 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74 69
75 /* If the pgd points to a shared pagetable level (either the 70 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 71 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 85 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 86 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 87 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 88}
96 89
97static void pgd_dtor(void *pgd) 90static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
119 112
120#ifdef CONFIG_X86_PAE 113#ifdef CONFIG_X86_PAE
121/* 114/*
115 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116 * updating the top-level pagetable entries to guarantee the
117 * processor notices the update. Since this is expensive, and
118 * all 4 top-level entries are used almost immediately in a
119 * new process's life, we just pre-populate them here.
120 *
121 * Also, if we're in a paravirt environment where the kernel pmd is
122 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
123 * and initialize the kernel pmds here.
124 */
125#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
126
127void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
128{
129 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
130
131 /* Note: almost everything apart from _PAGE_PRESENT is
132 reserved at the pmd (PDPT) level. */
133 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
134
135 /*
136 * According to Intel App note "TLBs, Paging-Structure Caches,
137 * and Their Invalidation", April 2007, document 317080-001,
138 * section 8.1: in PAE mode we explicitly have to flush the
139 * TLB via cr3 if the top-level pgd is changed...
140 */
141 if (mm == current->active_mm)
142 write_cr3(read_cr3());
143}
144#else /* !CONFIG_X86_PAE */
145
146/* No need to prepopulate any pagetable entries in non-PAE modes. */
147#define PREALLOCATED_PMDS 0
148
149#endif /* CONFIG_X86_PAE */
150
151static void free_pmds(pmd_t *pmds[])
152{
153 int i;
154
155 for(i = 0; i < PREALLOCATED_PMDS; i++)
156 if (pmds[i])
157 free_page((unsigned long)pmds[i]);
158}
159
160static int preallocate_pmds(pmd_t *pmds[])
161{
162 int i;
163 bool failed = false;
164
165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
166 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
167 if (pmd == NULL)
168 failed = true;
169 pmds[i] = pmd;
170 }
171
172 if (failed) {
173 free_pmds(pmds);
174 return -ENOMEM;
175 }
176
177 return 0;
178}
179
180/*
122 * Mop up any pmd pages which may still be attached to the pgd. 181 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 182 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 183 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 187{
129 int i; 188 int i;
130 189
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 190 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 191 pgd_t pgd = pgdp[i];
133 192
134 if (pgd_val(pgd) != 0) { 193 if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 201 }
143} 202}
144 203
145/* 204static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 205{
158 pud_t *pud; 206 pud_t *pud;
159 unsigned long addr; 207 unsigned long addr;
160 int i; 208 int i;
161 209
162 pud = pud_offset(pgd, 0); 210 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 211
167 if (!pmd) { 212 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 213 i++, pud++, addr += PUD_SIZE) {
169 return 0; 214 pmd_t *pmd = pmds[i];
170 }
171 215
172 if (i >= KERNEL_PGD_BOUNDARY) 216 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 217 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 219
176 pud_populate(mm, pud, pmd); 220 pud_populate(mm, pud, pmd);
177 } 221 }
178
179 return 1;
180} 222}
181 223
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 224pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 225{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 226 pgd_t *pgd;
227 pmd_t *pmds[PREALLOCATED_PMDS];
228 unsigned long flags;
185 229
186 /* Note: almost everything apart from _PAGE_PRESENT is 230 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 231
190 /* 232 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 233 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 234
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 235 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 236
211pgd_t *pgd_alloc(struct mm_struct *mm) 237 if (preallocate_pmds(pmds) != 0)
212{ 238 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 239
215 /* so that alloc_pmd can use it */ 240 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 241 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 242
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 243 /*
221 pgd_dtor(pgd); 244 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 245 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 246 * never see a partially populated pgd.
224 } 247 */
248 spin_lock_irqsave(&pgd_lock, flags);
249
250 pgd_ctor(pgd);
251 pgd_prepopulate_pmd(mm, pgd, pmds);
252
253 spin_unlock_irqrestore(&pgd_lock, flags);
225 254
226 return pgd; 255 return pgd;
256
257out_free_pmds:
258 free_pmds(pmds);
259out_free_pgd:
260 free_page((unsigned long)pgd);
261out:
262 return NULL;
227} 263}
228 264
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 265void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 266{
231 pgd_mop_up_pmds(mm, pgd); 267 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 268 pgd_dtor(pgd);
269 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 270 free_page((unsigned long)pgd);
234} 271}
235 272
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 292
256 if (pte_young(*ptep)) 293 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 294 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 295 (unsigned long *) &ptep->pte);
259 296
260 if (ret) 297 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 298 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 311
275 return young; 312 return young;
276} 313}
314
315int fixmaps_set;
316
317void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
318{
319 unsigned long address = __fix_to_virt(idx);
320
321 if (idx >= __end_of_fixed_addresses) {
322 BUG();
323 return;
324 }
325 set_pte_vaddr(address, pte);
326 fixmaps_set++;
327}
328
329void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
330{
331 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
332}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,58 +20,11 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
73 */ 26 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 27void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 28{
76 pgd_t *pgd; 29 pgd_t *pgd;
77 pud_t *pud; 30 pud_t *pud;
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 47 return;
95 } 48 }
96 pte = pte_offset_kernel(pmd, vaddr); 49 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 50 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 51 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 52 else
100 pte_clear(&init_mm, vaddr, pte); 53 pte_clear(&init_mm, vaddr, pte);
101 54
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 94 __flush_tlb_one(vaddr);
142} 95}
143 96
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 97unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 98EXPORT_SYMBOL(__FIXADDR_TOP);
147 99
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 100/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 101 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 102 * @reserve - size of hole to reserve
@@ -164,11 +104,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 104 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 105 * of kernel address space to make room for a hypervisor.
166 */ 106 */
167void reserve_top_address(unsigned long reserve) 107void __init reserve_top_address(unsigned long reserve)
168{ 108{
169 BUG_ON(fixmaps > 0); 109 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 111 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 113 __VMALLOC_RESERVE += reserve;
174} 114}
115
116/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the
119 * vmalloc area - the default is 128m.
120 */
121static int __init parse_vmalloc(char *arg)
122{
123 if (!arg)
124 return -EINVAL;
125
126 __VMALLOC_RESERVE = memparse(arg, &arg);
127 return 0;
128}
129early_param("vmalloc", parse_vmalloc);
130
131/*
132 * reservetop=size reserves a hole at the top of the kernel address space which
133 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
134 * so relocating the fixmap can be done before paging initialization.
135 */
136static int __init parse_reservetop(char *arg)
137{
138 unsigned long address;
139
140 if (!arg)
141 return -EINVAL;
142
143 address = memparse(arg, &arg);
144 reserve_top_address(address);
145 return 0;
146}
147early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/mm/srat_32.c
index 70e4a374b4e8..1eb2973a301c 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -31,6 +31,7 @@
31#include <asm/srat.h> 31#include <asm/srat.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/smp.h> 33#include <asm/smp.h>
34#include <asm/e820.h>
34 35
35/* 36/*
36 * proximity macros and definitions 37 * proximity macros and definitions
@@ -41,7 +42,7 @@
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) 42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42/* bitmap length; _PXM is at most 255 */ 43/* bitmap length; _PXM is at most 255 */
43#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
44static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ 45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
45 46
46#define MAX_CHUNKS_PER_NODE 3 47#define MAX_CHUNKS_PER_NODE 3
47#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) 48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -52,16 +53,37 @@ struct node_memory_chunk_s {
52 u8 nid; // which cnode contains this chunk? 53 u8 nid; // which cnode contains this chunk?
53 u8 bank; // which mem bank on this node 54 u8 bank; // which mem bank on this node
54}; 55};
55static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; 56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
56 57
57static int num_memory_chunks; /* total number of memory chunks */ 58static int __initdata num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID]; 59static u8 __initdata apicid_to_pxm[MAX_APICID];
59 60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
60/* Identify CPU proximity domains */ 76/* Identify CPU proximity domains */
61static void __init parse_cpu_affinity_structure(char *p) 77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
62{ 79{
63 struct acpi_srat_cpu_affinity *cpu_affinity = 80 if (srat_disabled())
64 (struct acpi_srat_cpu_affinity *) p; 81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
65 87
66 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) 88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
67 return; /* empty entry */ 89 return; /* empty entry */
@@ -71,7 +93,7 @@ static void __init parse_cpu_affinity_structure(char *p)
71 93
72 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; 94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
73 95
74 printk("CPU 0x%02X in proximity domain 0x%02X\n", 96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
75 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); 97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
76} 98}
77 99
@@ -79,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
79 * Identify memory proximity domains and hot-remove capabilities. 101 * Identify memory proximity domains and hot-remove capabilities.
80 * Fill node memory chunk list structure. 102 * Fill node memory chunk list structure.
81 */ 103 */
82static void __init parse_memory_affinity_structure (char *sratp) 104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
83{ 106{
84 unsigned long long paddr, size; 107 unsigned long long paddr, size;
85 unsigned long start_pfn, end_pfn; 108 unsigned long start_pfn, end_pfn;
86 u8 pxm; 109 u8 pxm;
87 struct node_memory_chunk_s *p, *q, *pend; 110 struct node_memory_chunk_s *p, *q, *pend;
88 struct acpi_srat_mem_affinity *memory_affinity = 111
89 (struct acpi_srat_mem_affinity *) sratp; 112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
90 119
91 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) 120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
92 return; /* empty entry */ 121 return; /* empty entry */
@@ -105,7 +134,8 @@ static void __init parse_memory_affinity_structure (char *sratp)
105 134
106 135
107 if (num_memory_chunks >= MAXCHUNKS) { 136 if (num_memory_chunks >= MAXCHUNKS) {
108 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", 137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
109 size/(1024*1024), paddr); 139 size/(1024*1024), paddr);
110 return; 140 return;
111 } 141 }
@@ -126,14 +156,22 @@ static void __init parse_memory_affinity_structure (char *sratp)
126 156
127 num_memory_chunks++; 157 num_memory_chunks++;
128 158
129 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", 159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
130 start_pfn, end_pfn, 161 start_pfn, end_pfn,
131 memory_affinity->memory_type,
132 pxm, 162 pxm,
133 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? 163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
134 "enabled and removable" : "enabled" ) ); 164 "enabled and removable" : "enabled" ) );
135} 165}
136 166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
137/* 175/*
138 * The SRAT table always lists ascending addresses, so can always 176 * The SRAT table always lists ascending addresses, so can always
139 * assume that the first "start" address that you see is the real 177 * assume that the first "start" address that you see is the real
@@ -149,7 +187,7 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
149 * *possible* memory hotplug areas the same as normal RAM. 187 * *possible* memory hotplug areas the same as normal RAM.
150 */ 188 */
151 if (memory_chunk->start_pfn >= max_pfn) { 189 if (memory_chunk->start_pfn >= max_pfn) {
152 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", 190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
153 memory_chunk->start_pfn, memory_chunk->end_pfn); 191 memory_chunk->start_pfn, memory_chunk->end_pfn);
154 return; 192 return;
155 } 193 }
@@ -166,42 +204,17 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
166 node_end_pfn[nid] = memory_chunk->end_pfn; 204 node_end_pfn[nid] = memory_chunk->end_pfn;
167} 205}
168 206
169/* Parse the ACPI Static Resource Affinity Table */ 207int __init get_memcfg_from_srat(void)
170static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
171{ 208{
172 u8 *start, *end, *p;
173 int i, j, nid; 209 int i, j, nid;
174 210
175 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
176 p = start;
177 end = (u8 *)sratp + sratp->header.length;
178 211
179 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ 212 if (srat_disabled())
180 memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); 213 goto out_fail;
181
182 num_memory_chunks = 0;
183 while (p < end) {
184 switch (*p) {
185 case ACPI_SRAT_TYPE_CPU_AFFINITY:
186 parse_cpu_affinity_structure(p);
187 break;
188 case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
189 parse_memory_affinity_structure(p);
190 break;
191 default:
192 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
193 break;
194 }
195 p += p[1];
196 if (p[1] == 0) {
197 printk("acpi20_parse_srat: Entry length value is zero;"
198 " can't parse any further!\n");
199 break;
200 }
201 }
202 214
203 if (num_memory_chunks == 0) { 215 if (num_memory_chunks == 0) {
204 printk("could not finy any ACPI SRAT memory areas.\n"); 216 printk(KERN_WARNING
217 "could not finy any ACPI SRAT memory areas.\n");
205 goto out_fail; 218 goto out_fail;
206 } 219 }
207 220
@@ -228,131 +241,39 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
228 for (i = 0; i < num_memory_chunks; i++) 241 for (i = 0; i < num_memory_chunks; i++)
229 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); 242 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
230 243
231 printk("pxm bitmap: "); 244 printk(KERN_DEBUG "pxm bitmap: ");
232 for (i = 0; i < sizeof(pxm_bitmap); i++) { 245 for (i = 0; i < sizeof(pxm_bitmap); i++) {
233 printk("%02X ", pxm_bitmap[i]); 246 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
234 } 247 }
235 printk("\n"); 248 printk(KERN_CONT "\n");
236 printk("Number of logical nodes in system = %d\n", num_online_nodes()); 249 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
237 printk("Number of memory chunks in system = %d\n", num_memory_chunks); 250 num_online_nodes());
251 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
252 num_memory_chunks);
238 253
239 for (i = 0; i < MAX_APICID; i++) 254 for (i = 0; i < MAX_APICID; i++)
240 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 255 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
241 256
242 for (j = 0; j < num_memory_chunks; j++){ 257 for (j = 0; j < num_memory_chunks; j++){
243 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 258 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
244 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 259 printk(KERN_DEBUG
260 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
245 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 261 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
246 node_read_chunk(chunk->nid, chunk); 262 node_read_chunk(chunk->nid, chunk);
247 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); 263 e820_register_active_regions(chunk->nid, chunk->start_pfn,
264 min(chunk->end_pfn, max_pfn));
248 } 265 }
249 266
250 for_each_online_node(nid) { 267 for_each_online_node(nid) {
251 unsigned long start = node_start_pfn[nid]; 268 unsigned long start = node_start_pfn[nid];
252 unsigned long end = node_end_pfn[nid]; 269 unsigned long end = min(node_end_pfn[nid], max_pfn);
253 270
254 memory_present(nid, start, end); 271 memory_present(nid, start, end);
255 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); 272 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
256 } 273 }
257 return 1; 274 return 1;
258out_fail: 275out_fail:
259 return 0; 276 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
260} 277 " table\n");
261
262struct acpi_static_rsdt {
263 struct acpi_table_rsdt table;
264 u32 padding[7]; /* Allow for 7 more table entries */
265};
266
267int __init get_memcfg_from_srat(void)
268{
269 struct acpi_table_header *header = NULL;
270 struct acpi_table_rsdp *rsdp = NULL;
271 struct acpi_table_rsdt *rsdt = NULL;
272 acpi_native_uint rsdp_address = 0;
273 struct acpi_static_rsdt saved_rsdt;
274 int tables = 0;
275 int i = 0;
276
277 rsdp_address = acpi_os_get_root_pointer();
278 if (!rsdp_address) {
279 printk("%s: System description tables not found\n",
280 __func__);
281 goto out_err;
282 }
283
284 printk("%s: assigning address to rsdp\n", __func__);
285 rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
286 if (!rsdp) {
287 printk("%s: Didn't find ACPI root!\n", __func__);
288 goto out_err;
289 }
290
291 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
292 rsdp->oem_id);
293
294 if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
295 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
296 goto out_err;
297 }
298
299 rsdt = (struct acpi_table_rsdt *)
300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
301
302 if (!rsdt) {
303 printk(KERN_WARNING
304 "%s: ACPI: Invalid root system description tables (RSDT)\n",
305 __func__);
306 goto out_err;
307 }
308
309 header = &rsdt->header;
310
311 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
312 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
313 goto out_err;
314 }
315
316 /*
317 * The number of tables is computed by taking the
318 * size of all entries (header size minus total
319 * size of RSDT) divided by the size of each entry
320 * (4-byte table pointers).
321 */
322 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
323
324 if (!tables)
325 goto out_err;
326
327 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
328
329 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
330 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
331 saved_rsdt.table.header.length);
332 goto out_err;
333 }
334
335 printk("Begin SRAT table scan....\n");
336
337 for (i = 0; i < tables; i++) {
338 /* Map in header, then map in full table length. */
339 header = (struct acpi_table_header *)
340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
341 if (!header)
342 break;
343 header = (struct acpi_table_header *)
344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
345 if (!header)
346 break;
347
348 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
349 continue;
350
351 /* we've found the srat table. don't need to look at any more tables */
352 return acpi20_parse_srat((struct acpi_table_srat *)header);
353 }
354out_err:
355 remove_all_active_ranges();
356 printk("failed to get NUMA memory information from SRAT table\n");
357 return 0; 278 return 0;
358} 279}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic(" Can not save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 311 pxmram = 0;
300 } 312 }
301 313
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 317 printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
377 continue; 389 continue;
378 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
380 } 392 }
381 numa_init_array(); 393 numa_init_array();
382 return 0; 394 return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
495 507
496EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
497 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
499{ 512{
500 int i, ret = 0; 513 int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 519 return ret;
507} 520}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 522#endif
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cc48d3fde545..3f90289410e6 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -218,8 +218,8 @@ static int nmi_setup(void)
218 } 218 }
219 219
220 } 220 }
221 on_each_cpu(nmi_save_registers, NULL, 0, 1); 221 on_each_cpu(nmi_save_registers, NULL, 1);
222 on_each_cpu(nmi_cpu_setup, NULL, 0, 1); 222 on_each_cpu(nmi_cpu_setup, NULL, 1);
223 nmi_enabled = 1; 223 nmi_enabled = 1;
224 return 0; 224 return 0;
225} 225}
@@ -269,12 +269,13 @@ static void nmi_cpu_shutdown(void *dummy)
269 269
270static void nmi_shutdown(void) 270static void nmi_shutdown(void)
271{ 271{
272 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); 272 struct op_msrs *msrs = &get_cpu_var(cpu_msrs);
273 nmi_enabled = 0; 273 nmi_enabled = 0;
274 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); 274 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
275 unregister_die_notifier(&profile_exceptions_nb); 275 unregister_die_notifier(&profile_exceptions_nb);
276 model->shutdown(msrs); 276 model->shutdown(msrs);
277 free_msrs(); 277 free_msrs();
278 put_cpu_var(cpu_msrs);
278} 279}
279 280
280static void nmi_cpu_start(void *dummy) 281static void nmi_cpu_start(void *dummy)
@@ -285,7 +286,7 @@ static void nmi_cpu_start(void *dummy)
285 286
286static int nmi_start(void) 287static int nmi_start(void)
287{ 288{
288 on_each_cpu(nmi_cpu_start, NULL, 0, 1); 289 on_each_cpu(nmi_cpu_start, NULL, 1);
289 return 0; 290 return 0;
290} 291}
291 292
@@ -297,7 +298,7 @@ static void nmi_cpu_stop(void *dummy)
297 298
298static void nmi_stop(void) 299static void nmi_stop(void)
299{ 300{
300 on_each_cpu(nmi_cpu_stop, NULL, 0, 1); 301 on_each_cpu(nmi_cpu_stop, NULL, 1);
301} 302}
302 303
303struct op_counter_config counter_config[OP_MAX_COUNTER]; 304struct op_counter_config counter_config[OP_MAX_COUNTER];
@@ -368,20 +369,34 @@ static int __init ppro_init(char **cpu_type)
368{ 369{
369 __u8 cpu_model = boot_cpu_data.x86_model; 370 __u8 cpu_model = boot_cpu_data.x86_model;
370 371
371 if (cpu_model == 14) 372 switch (cpu_model) {
373 case 0 ... 2:
374 *cpu_type = "i386/ppro";
375 break;
376 case 3 ... 5:
377 *cpu_type = "i386/pii";
378 break;
379 case 6 ... 8:
380 *cpu_type = "i386/piii";
381 break;
382 case 9:
383 *cpu_type = "i386/p6_mobile";
384 break;
385 case 10 ... 13:
386 *cpu_type = "i386/p6";
387 break;
388 case 14:
372 *cpu_type = "i386/core"; 389 *cpu_type = "i386/core";
373 else if (cpu_model == 15 || cpu_model == 23) 390 break;
391 case 15: case 23:
392 *cpu_type = "i386/core_2";
393 break;
394 case 26:
374 *cpu_type = "i386/core_2"; 395 *cpu_type = "i386/core_2";
375 else if (cpu_model > 0xd) 396 break;
397 default:
398 /* Unknown */
376 return 0; 399 return 0;
377 else if (cpu_model == 9) {
378 *cpu_type = "i386/p6_mobile";
379 } else if (cpu_model > 5) {
380 *cpu_type = "i386/piii";
381 } else if (cpu_model > 2) {
382 *cpu_type = "i386/pii";
383 } else {
384 *cpu_type = "i386/ppro";
385 } 400 }
386 401
387 model = &op_ppro_spec; 402 model = &op_ppro_spec;
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index c5c8e485fc44..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -1,5 +1,17 @@
1ifeq ($(CONFIG_X86_32),y) 1obj-y := i386.o init.o
2include ${srctree}/arch/x86/pci/Makefile_32 2
3else 3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4include ${srctree}/arch/x86/pci/Makefile_64 4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5endif 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8obj-y += fixup.o
9obj-$(CONFIG_ACPI) += acpi.o
10obj-y += legacy.o irq.o
11
12obj-$(CONFIG_X86_VISWS) += visws.o
13
14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15
16obj-y += common.o early.o
17obj-y += amd_bus.o
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
deleted file mode 100644
index 89ec35d00efd..000000000000
--- a/arch/x86/pci/Makefile_32
+++ /dev/null
@@ -1,24 +0,0 @@
1obj-y := i386.o init.o
2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o
7
8pci-y := fixup.o
9
10# Do not change the ordering here. There is a nasty init function
11# ordering dependency which breaks when you move acpi.o below
12# legacy/irq.o
13pci-$(CONFIG_ACPI) += acpi.o
14pci-y += legacy.o irq.o
15
16# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are
17# therefor correct. This needs a proper fix by distangling the code.
18pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
19pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
20
21# Necessary for NUMAQ as well
22pci-$(CONFIG_NUMA) += mp_bus_to_node.o
23
24obj-y += $(pci-y) common.o early.o
diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64
deleted file mode 100644
index 8fbd19832cf6..000000000000
--- a/arch/x86/pci/Makefile_64
+++ /dev/null
@@ -1,17 +0,0 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6EXTRA_CFLAGS += -Iarch/x86/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o
15
16obj-y += k8-bus_64.o
17
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d95de2f199cd..19af06927fbc 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -171,8 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
171 if (node != -1) 171 if (node != -1)
172 set_mp_bus_to_node(busnum, node); 172 set_mp_bus_to_node(busnum, node);
173 else 173 else
174 node = get_mp_bus_to_node(busnum);
175#endif 174#endif
175 node = get_mp_bus_to_node(busnum);
176
177 if (node != -1 && !node_online(node))
178 node = -1;
176 179
177 /* Allocate per-root-bus (not per bus) arch-specific data. 180 /* Allocate per-root-bus (not per bus) arch-specific data.
178 * TODO: leak; this memory is never freed. 181 * TODO: leak; this memory is never freed.
@@ -204,22 +207,23 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
204 if (!bus) 207 if (!bus)
205 kfree(sd); 208 kfree(sd);
206 209
210 if (bus && node != -1) {
207#ifdef CONFIG_ACPI_NUMA 211#ifdef CONFIG_ACPI_NUMA
208 if (bus) { 212 if (pxm >= 0)
209 if (pxm >= 0) {
210 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", 213 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
211 busnum, pxm, pxm_to_node(pxm)); 214 busnum, pxm, node);
212 } 215#else
213 } 216 printk(KERN_DEBUG "bus %02x -> node %d\n",
217 busnum, node);
214#endif 218#endif
219 }
215 220
216 if (bus && (pci_probe & PCI_USE__CRS)) 221 if (bus && (pci_probe & PCI_USE__CRS))
217 get_current_resources(device, busnum, domain, bus); 222 get_current_resources(device, busnum, domain, bus);
218 return bus; 223 return bus;
219} 224}
220 225
221extern int pci_routeirq; 226int __init pci_acpi_init(void)
222static int __init pci_acpi_init(void)
223{ 227{
224 struct pci_dev *dev = NULL; 228 struct pci_dev *dev = NULL;
225 229
@@ -253,4 +257,3 @@ static int __init pci_acpi_init(void)
253 257
254 return 0; 258 return 0;
255} 259}
256subsys_initcall(pci_acpi_init);
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/amd_bus.c
index 5c2799c20e47..dbf532369711 100644
--- a/arch/x86/pci/k8-bus_64.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,40 +1,25 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h>
4#include "pci.h"
5
6#ifdef CONFIG_X86_64
3#include <asm/pci-direct.h> 7#include <asm/pci-direct.h>
4#include <asm/mpspec.h> 8#include <asm/mpspec.h>
5#include <linux/cpumask.h> 9#include <linux/cpumask.h>
6#include <linux/topology.h> 10#endif
7 11
8/* 12/*
9 * This discovers the pcibus <-> node mapping on AMD K8. 13 * This discovers the pcibus <-> node mapping on AMD K8.
10 * also get peer root bus resource for io,mmio 14 * also get peer root bus resource for io,mmio
11 */ 15 */
12 16
13
14/*
15 * sub bus (transparent) will use entres from 3 to store extra from root,
16 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
17 */
18#define RES_NUM 16
19struct pci_root_info {
20 char name[12];
21 unsigned int res_num;
22 struct resource res[RES_NUM];
23 int bus_min;
24 int bus_max;
25 int node;
26 int link;
27};
28
29/* 4 at this time, it may become to 32 */
30#define PCI_ROOT_NR 4
31static int pci_root_num;
32static struct pci_root_info pci_root_info[PCI_ROOT_NR];
33
34#ifdef CONFIG_NUMA 17#ifdef CONFIG_NUMA
35 18
36#define BUS_NR 256 19#define BUS_NR 256
37 20
21#ifdef CONFIG_X86_64
22
38static int mp_bus_to_node[BUS_NR]; 23static int mp_bus_to_node[BUS_NR];
39 24
40void set_mp_bus_to_node(int busnum, int node) 25void set_mp_bus_to_node(int busnum, int node)
@@ -61,7 +46,52 @@ int get_mp_bus_to_node(int busnum)
61 46
62 return node; 47 return node;
63} 48}
64#endif 49
50#else /* CONFIG_X86_32 */
51
52static unsigned char mp_bus_to_node[BUS_NR];
53
54void set_mp_bus_to_node(int busnum, int node)
55{
56 if (busnum >= 0 && busnum < BUS_NR)
57 mp_bus_to_node[busnum] = (unsigned char) node;
58}
59
60int get_mp_bus_to_node(int busnum)
61{
62 int node;
63
64 if (busnum < 0 || busnum > (BUS_NR - 1))
65 return 0;
66 node = mp_bus_to_node[busnum];
67 return node;
68}
69
70#endif /* CONFIG_X86_32 */
71
72#endif /* CONFIG_NUMA */
73
74#ifdef CONFIG_X86_64
75
76/*
77 * sub bus (transparent) will use entres from 3 to store extra from root,
78 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
79 */
80#define RES_NUM 16
81struct pci_root_info {
82 char name[12];
83 unsigned int res_num;
84 struct resource res[RES_NUM];
85 int bus_min;
86 int bus_max;
87 int node;
88 int link;
89};
90
91/* 4 at this time, it may become to 32 */
92#define PCI_ROOT_NR 4
93static int pci_root_num;
94static struct pci_root_info pci_root_info[PCI_ROOT_NR];
65 95
66void set_pci_bus_resources_arch_default(struct pci_bus *b) 96void set_pci_bus_resources_arch_default(struct pci_bus *b)
67{ 97{
@@ -384,7 +414,7 @@ static int __init early_fill_mp_bus_info(void)
384 /* need to take out [0, TOM) for RAM*/ 414 /* need to take out [0, TOM) for RAM*/
385 address = MSR_K8_TOP_MEM1; 415 address = MSR_K8_TOP_MEM1;
386 rdmsrl(address, val); 416 rdmsrl(address, val);
387 end = (val & 0xffffff8000000ULL); 417 end = (val & 0xffffff800000ULL);
388 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 418 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
389 if (end < (1ULL<<32)) 419 if (end < (1ULL<<32))
390 update_range(range, 0, end - 1); 420 update_range(range, 0, end - 1);
@@ -478,7 +508,7 @@ static int __init early_fill_mp_bus_info(void)
478 /* TOP_MEM2 */ 508 /* TOP_MEM2 */
479 address = MSR_K8_TOP_MEM2; 509 address = MSR_K8_TOP_MEM2;
480 rdmsrl(address, val); 510 rdmsrl(address, val);
481 end = (val & 0xffffff8000000ULL); 511 end = (val & 0xffffff800000ULL);
482 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 512 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
483 update_range(range, 1ULL<<32, end - 1); 513 update_range(range, 1ULL<<32, end - 1);
484 } 514 }
@@ -526,3 +556,31 @@ static int __init early_fill_mp_bus_info(void)
526} 556}
527 557
528postcore_initcall(early_fill_mp_bus_info); 558postcore_initcall(early_fill_mp_bus_info);
559
560#endif
561
562/* common 32/64 bit code */
563
564#define ENABLE_CF8_EXT_CFG (1ULL << 46)
565
566static void enable_pci_io_ecs_per_cpu(void *unused)
567{
568 u64 reg;
569 rdmsrl(MSR_AMD64_NB_CFG, reg);
570 if (!(reg & ENABLE_CF8_EXT_CFG)) {
571 reg |= ENABLE_CF8_EXT_CFG;
572 wrmsrl(MSR_AMD64_NB_CFG, reg);
573 }
574}
575
576static int __init enable_pci_io_ecs(void)
577{
578 /* assume all cpus from fam10h have IO ECS */
579 if (boot_cpu_data.x86 < 0x10)
580 return 0;
581 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1);
582 pci_probe |= PCI_HAS_IO_ECS;
583 return 0;
584}
585
586postcore_initcall(enable_pci_io_ecs);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 6e64aaf00d1d..b67732bbb85a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -20,6 +20,7 @@
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 21 PCI_PROBE_MMCONF;
22 22
23unsigned int pci_early_dump_regs;
23static int pci_bf_sort; 24static int pci_bf_sort;
24int pci_routeirq; 25int pci_routeirq;
25int pcibios_last_bus = -1; 26int pcibios_last_bus = -1;
@@ -31,7 +32,7 @@ struct pci_raw_ops *raw_pci_ext_ops;
31int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, 32int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
32 int reg, int len, u32 *val) 33 int reg, int len, u32 *val)
33{ 34{
34 if (reg < 256 && raw_pci_ops) 35 if (domain == 0 && reg < 256 && raw_pci_ops)
35 return raw_pci_ops->read(domain, bus, devfn, reg, len, val); 36 return raw_pci_ops->read(domain, bus, devfn, reg, len, val);
36 if (raw_pci_ext_ops) 37 if (raw_pci_ext_ops)
37 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); 38 return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val);
@@ -41,7 +42,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
41int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, 42int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
42 int reg, int len, u32 val) 43 int reg, int len, u32 val)
43{ 44{
44 if (reg < 256 && raw_pci_ops) 45 if (domain == 0 && reg < 256 && raw_pci_ops)
45 return raw_pci_ops->write(domain, bus, devfn, reg, len, val); 46 return raw_pci_ops->write(domain, bus, devfn, reg, len, val);
46 if (raw_pci_ext_ops) 47 if (raw_pci_ext_ops)
47 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); 48 return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val);
@@ -121,6 +122,21 @@ void __init dmi_check_skip_isa_align(void)
121 dmi_check_system(can_skip_pciprobe_dmi_table); 122 dmi_check_system(can_skip_pciprobe_dmi_table);
122} 123}
123 124
125static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
126{
127 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
128
129 if (pci_probe & PCI_NOASSIGN_ROMS) {
130 if (rom_r->parent)
131 return;
132 if (rom_r->start) {
133 /* we deal with BIOS assigned ROM later */
134 return;
135 }
136 rom_r->start = rom_r->end = rom_r->flags = 0;
137 }
138}
139
124/* 140/*
125 * Called after each bus is probed, but before its children 141 * Called after each bus is probed, but before its children
126 * are examined. 142 * are examined.
@@ -128,7 +144,11 @@ void __init dmi_check_skip_isa_align(void)
128 144
129void __devinit pcibios_fixup_bus(struct pci_bus *b) 145void __devinit pcibios_fixup_bus(struct pci_bus *b)
130{ 146{
147 struct pci_dev *dev;
148
131 pci_read_bridge_bases(b); 149 pci_read_bridge_bases(b);
150 list_for_each_entry(dev, &b->devices, bus_list)
151 pcibios_fixup_device_resources(dev);
132} 152}
133 153
134/* 154/*
@@ -384,7 +404,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
384 404
385extern u8 pci_cache_line_size; 405extern u8 pci_cache_line_size;
386 406
387static int __init pcibios_init(void) 407int __init pcibios_init(void)
388{ 408{
389 struct cpuinfo_x86 *c = &boot_cpu_data; 409 struct cpuinfo_x86 *c = &boot_cpu_data;
390 410
@@ -411,8 +431,6 @@ static int __init pcibios_init(void)
411 return 0; 431 return 0;
412} 432}
413 433
414subsys_initcall(pcibios_init);
415
416char * __devinit pcibios_setup(char *str) 434char * __devinit pcibios_setup(char *str)
417{ 435{
418 if (!strcmp(str, "off")) { 436 if (!strcmp(str, "off")) {
@@ -483,12 +501,18 @@ char * __devinit pcibios_setup(char *str)
483 else if (!strcmp(str, "rom")) { 501 else if (!strcmp(str, "rom")) {
484 pci_probe |= PCI_ASSIGN_ROMS; 502 pci_probe |= PCI_ASSIGN_ROMS;
485 return NULL; 503 return NULL;
504 } else if (!strcmp(str, "norom")) {
505 pci_probe |= PCI_NOASSIGN_ROMS;
506 return NULL;
486 } else if (!strcmp(str, "assign-busses")) { 507 } else if (!strcmp(str, "assign-busses")) {
487 pci_probe |= PCI_ASSIGN_ALL_BUSSES; 508 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
488 return NULL; 509 return NULL;
489 } else if (!strcmp(str, "use_crs")) { 510 } else if (!strcmp(str, "use_crs")) {
490 pci_probe |= PCI_USE__CRS; 511 pci_probe |= PCI_USE__CRS;
491 return NULL; 512 return NULL;
513 } else if (!strcmp(str, "earlydump")) {
514 pci_early_dump_regs = 1;
515 return NULL;
492 } else if (!strcmp(str, "routeirq")) { 516 } else if (!strcmp(str, "routeirq")) {
493 pci_routeirq = 1; 517 pci_routeirq = 1;
494 return NULL; 518 return NULL;
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 21d1e0e0d535..9915293500fb 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -8,18 +8,21 @@
8#include "pci.h" 8#include "pci.h"
9 9
10/* 10/*
11 * Functions for accessing PCI configuration space with type 1 accesses 11 * Functions for accessing PCI base (first 256 bytes) and extended
12 * (4096 bytes per PCI function) configuration space with type 1
13 * accesses.
12 */ 14 */
13 15
14#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ 16#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) 17 (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \
18 | (devfn << 8) | (reg & 0xFC))
16 19
17static int pci_conf1_read(unsigned int seg, unsigned int bus, 20static int pci_conf1_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value) 21 unsigned int devfn, int reg, int len, u32 *value)
19{ 22{
20 unsigned long flags; 23 unsigned long flags;
21 24
22 if ((bus > 255) || (devfn > 255) || (reg > 255)) { 25 if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
23 *value = -1; 26 *value = -1;
24 return -EINVAL; 27 return -EINVAL;
25 } 28 }
@@ -50,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
50{ 53{
51 unsigned long flags; 54 unsigned long flags;
52 55
53 if ((bus > 255) || (devfn > 255) || (reg > 255)) 56 if ((bus > 255) || (devfn > 255) || (reg > 4095))
54 return -EINVAL; 57 return -EINVAL;
55 58
56 spin_lock_irqsave(&pci_config_lock, flags); 59 spin_lock_irqsave(&pci_config_lock, flags);
@@ -260,10 +263,18 @@ void __init pci_direct_init(int type)
260 return; 263 return;
261 printk(KERN_INFO "PCI: Using configuration type %d for base access\n", 264 printk(KERN_INFO "PCI: Using configuration type %d for base access\n",
262 type); 265 type);
263 if (type == 1) 266 if (type == 1) {
264 raw_pci_ops = &pci_direct_conf1; 267 raw_pci_ops = &pci_direct_conf1;
265 else 268 if (raw_pci_ext_ops)
266 raw_pci_ops = &pci_direct_conf2; 269 return;
270 if (!(pci_probe & PCI_HAS_IO_ECS))
271 return;
272 printk(KERN_INFO "PCI: Using configuration type 1 "
273 "for extended access\n");
274 raw_pci_ext_ops = &pci_direct_conf1;
275 return;
276 }
277 raw_pci_ops = &pci_direct_conf2;
267} 278}
268 279
269int __init pci_direct_probe(void) 280int __init pci_direct_probe(void)
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 42df4b6606df..86631ccbc25a 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -7,15 +7,13 @@
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
9 9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) 10u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{ 11{
14 u32 v; 12 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc); 14 v = inl(0xcfc);
17 if (v != 0xffffffff) 15 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); 16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v; 17 return v;
20} 18}
21 19
@@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
24 u8 v; 22 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3)); 24 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); 25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v; 26 return v;
29} 27}
30 28
@@ -33,23 +31,30 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
33 u16 v; 31 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2)); 33 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); 34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v; 35 return v;
38} 36}
39 37
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val) 39 u32 val)
42{ 40{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val); 41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc); 43 outl(val, 0xcfc);
46} 44}
47 45
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 47{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3));
51}
52
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc); 57 outw(val, 0xcfc + (offset&2));
53} 58}
54 59
55int early_pci_allowed(void) 60int early_pci_allowed(void)
@@ -57,3 +62,54 @@ int early_pci_allowed(void)
57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 62 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
58 PCI_PROBE_CONF1; 63 PCI_PROBE_CONF1;
59} 64}
65
66void early_dump_pci_device(u8 bus, u8 slot, u8 func)
67{
68 int i;
69 int j;
70 u32 val;
71
72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
73
74 for (i = 0; i < 256; i += 4) {
75 if (!(i & 0x0f))
76 printk("\n%04x:",i);
77
78 val = read_pci_config(bus, slot, func, i);
79 for (j = 0; j < 4; j++) {
80 printk(" %02x", val & 0xff);
81 val >>= 8;
82 }
83 }
84 printk("\n");
85}
86
87void early_dump_pci_devices(void)
88{
89 unsigned bus, slot, func;
90
91 if (!early_pci_allowed())
92 return;
93
94 for (bus = 0; bus < 256; bus++) {
95 for (slot = 0; slot < 32; slot++) {
96 for (func = 0; func < 8; func++) {
97 u32 class;
98 u8 type;
99 class = read_pci_config(bus, slot, func,
100 PCI_CLASS_REVISION);
101 if (class == 0xffffffff)
102 break;
103
104 early_dump_pci_device(bus, slot, func);
105
106 /* No multi-function device? */
107 type = read_pci_config_byte(bus, slot, func,
108 PCI_HEADER_TYPE);
109 if (!(type & 0x80))
110 break;
111 }
112 }
113 }
114}
115
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 10fb308fded8..a09505806b82 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -280,6 +280,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 280static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 281 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 282 .close = pci_unmap_page_range,
283 .access = generic_access_phys,
283}; 284};
284 285
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 286int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
@@ -299,9 +300,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
299 return -EINVAL; 300 return -EINVAL;
300 301
301 prot = pgprot_val(vma->vm_page_prot); 302 prot = pgprot_val(vma->vm_page_prot);
302 if (pat_wc_enabled && write_combine) 303 if (pat_enabled && write_combine)
303 prot |= _PAGE_CACHE_WC; 304 prot |= _PAGE_CACHE_WC;
304 else if (pat_wc_enabled || boot_cpu_data.x86 > 3) 305 else if (pat_enabled || boot_cpu_data.x86 > 3)
305 /* 306 /*
306 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 307 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
307 * To avoid attribute conflicts, request UC MINUS here 308 * To avoid attribute conflicts, request UC MINUS here
@@ -334,7 +335,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
334 flags = new_flags; 335 flags = new_flags;
335 } 336 }
336 337
337 if (vma->vm_pgoff <= max_pfn_mapped && 338 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
339 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
340 vma->vm_pgoff < max_pfn_mapped)) &&
338 ioremap_change_attr((unsigned long)__va(addr), len, flags)) { 341 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
339 free_memtype(addr, addr + len); 342 free_memtype(addr, addr + len);
340 return -EINVAL; 343 return -EINVAL;
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index b821f4462d99..d6c950f81858 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -4,7 +4,7 @@
4 4
5/* arch_initcall has too random ordering, so call the initializers 5/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 6 in the right sequence from here. */
7static __init int pci_access_init(void) 7static __init int pci_arch_init(void)
8{ 8{
9#ifdef CONFIG_PCI_DIRECT 9#ifdef CONFIG_PCI_DIRECT
10 int type = 0; 10 int type = 0;
@@ -40,4 +40,4 @@ static __init int pci_access_init(void)
40 40
41 return 0; 41 return 0;
42} 42}
43arch_initcall(pci_access_init); 43arch_initcall(pci_arch_init);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index ca8df9c260bc..6a06a2eb0597 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -11,8 +11,8 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/dmi.h> 13#include <linux/dmi.h>
14#include <asm/io.h> 14#include <linux/io.h>
15#include <asm/smp.h> 15#include <linux/smp.h>
16#include <asm/io_apic.h> 16#include <asm/io_apic.h>
17#include <linux/irq.h> 17#include <linux/irq.h>
18#include <linux/acpi.h> 18#include <linux/acpi.h>
@@ -45,7 +45,8 @@ struct irq_router {
45 char *name; 45 char *name;
46 u16 vendor, device; 46 u16 vendor, device;
47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); 47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); 48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
49 int new);
49}; 50};
50 51
51struct irq_router_handler { 52struct irq_router_handler {
@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
61 * and perform checksum verification. 62 * and perform checksum verification.
62 */ 63 */
63 64
64static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) 65static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
65{ 66{
66 struct irq_routing_table *rt; 67 struct irq_routing_table *rt;
67 int i; 68 int i;
@@ -74,10 +75,11 @@ static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
74 rt->size < sizeof(struct irq_routing_table)) 75 rt->size < sizeof(struct irq_routing_table))
75 return NULL; 76 return NULL;
76 sum = 0; 77 sum = 0;
77 for (i=0; i < rt->size; i++) 78 for (i = 0; i < rt->size; i++)
78 sum += addr[i]; 79 sum += addr[i];
79 if (!sum) { 80 if (!sum) {
80 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); 81 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
82 rt);
81 return rt; 83 return rt;
82 } 84 }
83 return NULL; 85 return NULL;
@@ -100,7 +102,7 @@ static struct irq_routing_table * __init pirq_find_routing_table(void)
100 return rt; 102 return rt;
101 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); 103 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
102 } 104 }
103 for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { 105 for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
104 rt = pirq_check_routing_table(addr); 106 rt = pirq_check_routing_table(addr);
105 if (rt) 107 if (rt)
106 return rt; 108 return rt;
@@ -122,20 +124,20 @@ static void __init pirq_peer_trick(void)
122 struct irq_info *e; 124 struct irq_info *e;
123 125
124 memset(busmap, 0, sizeof(busmap)); 126 memset(busmap, 0, sizeof(busmap));
125 for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { 127 for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
126 e = &rt->slots[i]; 128 e = &rt->slots[i];
127#ifdef DEBUG 129#ifdef DEBUG
128 { 130 {
129 int j; 131 int j;
130 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); 132 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
131 for(j=0; j<4; j++) 133 for (j = 0; j < 4; j++)
132 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); 134 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
133 DBG("\n"); 135 DBG("\n");
134 } 136 }
135#endif 137#endif
136 busmap[e->bus] = 1; 138 busmap[e->bus] = 1;
137 } 139 }
138 for(i = 1; i < 256; i++) { 140 for (i = 1; i < 256; i++) {
139 int node; 141 int node;
140 if (!busmap[i] || pci_find_bus(0, i)) 142 if (!busmap[i] || pci_find_bus(0, i))
141 continue; 143 continue;
@@ -183,7 +185,8 @@ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset,
183 return (nr & 1) ? (x >> 4) : (x & 0xf); 185 return (nr & 1) ? (x >> 4) : (x & 0xf);
184} 186}
185 187
186static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) 188static void write_config_nybble(struct pci_dev *router, unsigned offset,
189 unsigned nr, unsigned int val)
187{ 190{
188 u8 x; 191 u8 x;
189 unsigned reg = offset + (nr >> 1); 192 unsigned reg = offset + (nr >> 1);
@@ -285,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
285 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 288 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
286 289
287 WARN_ON_ONCE(pirq > 4); 290 WARN_ON_ONCE(pirq > 4);
288 return read_config_nybble(router,0x43, pirqmap[pirq-1]); 291 return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
289} 292}
290 293
291static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 294static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
@@ -314,7 +317,7 @@ static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
314 317
315/* 318/*
316 * Cyrix: nibble offset 0x5C 319 * Cyrix: nibble offset 0x5C
317 * 0x5C bits 7:4 is INTB bits 3:0 is INTA 320 * 0x5C bits 7:4 is INTB bits 3:0 is INTA
318 * 0x5D bits 7:4 is INTD bits 3:0 is INTC 321 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
319 */ 322 */
320static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 323static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
@@ -350,7 +353,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
350 * Apparently there are systems implementing PCI routing table using 353 * Apparently there are systems implementing PCI routing table using
351 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. 354 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
352 * We try our best to handle both link mappings. 355 * We try our best to handle both link mappings.
353 * 356 *
354 * Currently (2003-05-21) it appears most SiS chipsets follow the 357 * Currently (2003-05-21) it appears most SiS chipsets follow the
355 * definition of routing registers from the SiS-5595 southbridge. 358 * definition of routing registers from the SiS-5595 southbridge.
356 * According to the SiS 5595 datasheets the revision id's of the 359 * According to the SiS 5595 datasheets the revision id's of the
@@ -370,7 +373,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
370 * 373 *
371 * 0x62: USBIRQ: 374 * 0x62: USBIRQ:
372 * bit 6 OHCI function disabled (0), enabled (1) 375 * bit 6 OHCI function disabled (0), enabled (1)
373 * 376 *
374 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved 377 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
375 * 378 *
376 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved 379 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
@@ -467,7 +470,8 @@ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int
467 return inb(0xc01) & 0xf; 470 return inb(0xc01) & 0xf;
468} 471}
469 472
470static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 473static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
474 int pirq, int irq)
471{ 475{
472 outb(pirq, 0xc00); 476 outb(pirq, 0xc00);
473 outb(irq, 0xc01); 477 outb(irq, 0xc01);
@@ -487,9 +491,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
487 u8 irq; 491 u8 irq;
488 irq = 0; 492 irq = 0;
489 if (pirq <= 4) 493 if (pirq <= 4)
490 {
491 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
492 }
493 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 495 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
494 dev->vendor, dev->device, pirq, irq); 496 dev->vendor, dev->device, pirq, irq);
495 return irq; 497 return irq;
@@ -497,12 +499,10 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
497 499
498static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 500static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
499{ 501{
500 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 502 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
501 dev->vendor, dev->device, pirq, irq); 503 dev->vendor, dev->device, pirq, irq);
502 if (pirq <= 4) 504 if (pirq <= 4)
503 {
504 write_config_nybble(router, 0x56, pirq - 1, irq); 505 write_config_nybble(router, 0x56, pirq - 1, irq);
505 }
506 return 1; 506 return 1;
507} 507}
508 508
@@ -549,50 +549,49 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
549 if (pci_dev_present(pirq_440gx)) 549 if (pci_dev_present(pirq_440gx))
550 return 0; 550 return 0;
551 551
552 switch(device) 552 switch (device) {
553 { 553 case PCI_DEVICE_ID_INTEL_82371FB_0:
554 case PCI_DEVICE_ID_INTEL_82371FB_0: 554 case PCI_DEVICE_ID_INTEL_82371SB_0:
555 case PCI_DEVICE_ID_INTEL_82371SB_0: 555 case PCI_DEVICE_ID_INTEL_82371AB_0:
556 case PCI_DEVICE_ID_INTEL_82371AB_0: 556 case PCI_DEVICE_ID_INTEL_82371MX:
557 case PCI_DEVICE_ID_INTEL_82371MX: 557 case PCI_DEVICE_ID_INTEL_82443MX_0:
558 case PCI_DEVICE_ID_INTEL_82443MX_0: 558 case PCI_DEVICE_ID_INTEL_82801AA_0:
559 case PCI_DEVICE_ID_INTEL_82801AA_0: 559 case PCI_DEVICE_ID_INTEL_82801AB_0:
560 case PCI_DEVICE_ID_INTEL_82801AB_0: 560 case PCI_DEVICE_ID_INTEL_82801BA_0:
561 case PCI_DEVICE_ID_INTEL_82801BA_0: 561 case PCI_DEVICE_ID_INTEL_82801BA_10:
562 case PCI_DEVICE_ID_INTEL_82801BA_10: 562 case PCI_DEVICE_ID_INTEL_82801CA_0:
563 case PCI_DEVICE_ID_INTEL_82801CA_0: 563 case PCI_DEVICE_ID_INTEL_82801CA_12:
564 case PCI_DEVICE_ID_INTEL_82801CA_12: 564 case PCI_DEVICE_ID_INTEL_82801DB_0:
565 case PCI_DEVICE_ID_INTEL_82801DB_0: 565 case PCI_DEVICE_ID_INTEL_82801E_0:
566 case PCI_DEVICE_ID_INTEL_82801E_0: 566 case PCI_DEVICE_ID_INTEL_82801EB_0:
567 case PCI_DEVICE_ID_INTEL_82801EB_0: 567 case PCI_DEVICE_ID_INTEL_ESB_1:
568 case PCI_DEVICE_ID_INTEL_ESB_1: 568 case PCI_DEVICE_ID_INTEL_ICH6_0:
569 case PCI_DEVICE_ID_INTEL_ICH6_0: 569 case PCI_DEVICE_ID_INTEL_ICH6_1:
570 case PCI_DEVICE_ID_INTEL_ICH6_1: 570 case PCI_DEVICE_ID_INTEL_ICH7_0:
571 case PCI_DEVICE_ID_INTEL_ICH7_0: 571 case PCI_DEVICE_ID_INTEL_ICH7_1:
572 case PCI_DEVICE_ID_INTEL_ICH7_1: 572 case PCI_DEVICE_ID_INTEL_ICH7_30:
573 case PCI_DEVICE_ID_INTEL_ICH7_30: 573 case PCI_DEVICE_ID_INTEL_ICH7_31:
574 case PCI_DEVICE_ID_INTEL_ICH7_31: 574 case PCI_DEVICE_ID_INTEL_ESB2_0:
575 case PCI_DEVICE_ID_INTEL_ESB2_0: 575 case PCI_DEVICE_ID_INTEL_ICH8_0:
576 case PCI_DEVICE_ID_INTEL_ICH8_0: 576 case PCI_DEVICE_ID_INTEL_ICH8_1:
577 case PCI_DEVICE_ID_INTEL_ICH8_1: 577 case PCI_DEVICE_ID_INTEL_ICH8_2:
578 case PCI_DEVICE_ID_INTEL_ICH8_2: 578 case PCI_DEVICE_ID_INTEL_ICH8_3:
579 case PCI_DEVICE_ID_INTEL_ICH8_3: 579 case PCI_DEVICE_ID_INTEL_ICH8_4:
580 case PCI_DEVICE_ID_INTEL_ICH8_4: 580 case PCI_DEVICE_ID_INTEL_ICH9_0:
581 case PCI_DEVICE_ID_INTEL_ICH9_0: 581 case PCI_DEVICE_ID_INTEL_ICH9_1:
582 case PCI_DEVICE_ID_INTEL_ICH9_1: 582 case PCI_DEVICE_ID_INTEL_ICH9_2:
583 case PCI_DEVICE_ID_INTEL_ICH9_2: 583 case PCI_DEVICE_ID_INTEL_ICH9_3:
584 case PCI_DEVICE_ID_INTEL_ICH9_3: 584 case PCI_DEVICE_ID_INTEL_ICH9_4:
585 case PCI_DEVICE_ID_INTEL_ICH9_4: 585 case PCI_DEVICE_ID_INTEL_ICH9_5:
586 case PCI_DEVICE_ID_INTEL_ICH9_5: 586 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
587 case PCI_DEVICE_ID_INTEL_TOLAPAI_0: 587 case PCI_DEVICE_ID_INTEL_ICH10_0:
588 case PCI_DEVICE_ID_INTEL_ICH10_0: 588 case PCI_DEVICE_ID_INTEL_ICH10_1:
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_2:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_3:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 r->name = "PIIX/ICH";
592 r->name = "PIIX/ICH"; 592 r->get = pirq_piix_get;
593 r->get = pirq_piix_get; 593 r->set = pirq_piix_set;
594 r->set = pirq_piix_set; 594 return 1;
595 return 1;
596 } 595 }
597 return 0; 596 return 0;
598} 597}
@@ -606,7 +605,7 @@ static __init int via_router_probe(struct irq_router *r,
606 * workarounds for some buggy BIOSes 605 * workarounds for some buggy BIOSes
607 */ 606 */
608 if (device == PCI_DEVICE_ID_VIA_82C586_0) { 607 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
609 switch(router->device) { 608 switch (router->device) {
610 case PCI_DEVICE_ID_VIA_82C686: 609 case PCI_DEVICE_ID_VIA_82C686:
611 /* 610 /*
612 * Asus k7m bios wrongly reports 82C686A 611 * Asus k7m bios wrongly reports 82C686A
@@ -631,7 +630,7 @@ static __init int via_router_probe(struct irq_router *r,
631 } 630 }
632 } 631 }
633 632
634 switch(device) { 633 switch (device) {
635 case PCI_DEVICE_ID_VIA_82C586_0: 634 case PCI_DEVICE_ID_VIA_82C586_0:
636 r->name = "VIA"; 635 r->name = "VIA";
637 r->get = pirq_via586_get; 636 r->get = pirq_via586_get;
@@ -654,28 +653,27 @@ static __init int via_router_probe(struct irq_router *r,
654 653
655static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 654static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
656{ 655{
657 switch(device) 656 switch (device) {
658 { 657 case PCI_DEVICE_ID_VLSI_82C534:
659 case PCI_DEVICE_ID_VLSI_82C534: 658 r->name = "VLSI 82C534";
660 r->name = "VLSI 82C534"; 659 r->get = pirq_vlsi_get;
661 r->get = pirq_vlsi_get; 660 r->set = pirq_vlsi_set;
662 r->set = pirq_vlsi_set; 661 return 1;
663 return 1;
664 } 662 }
665 return 0; 663 return 0;
666} 664}
667 665
668 666
669static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 667static __init int serverworks_router_probe(struct irq_router *r,
668 struct pci_dev *router, u16 device)
670{ 669{
671 switch(device) 670 switch (device) {
672 { 671 case PCI_DEVICE_ID_SERVERWORKS_OSB4:
673 case PCI_DEVICE_ID_SERVERWORKS_OSB4: 672 case PCI_DEVICE_ID_SERVERWORKS_CSB5:
674 case PCI_DEVICE_ID_SERVERWORKS_CSB5: 673 r->name = "ServerWorks";
675 r->name = "ServerWorks"; 674 r->get = pirq_serverworks_get;
676 r->get = pirq_serverworks_get; 675 r->set = pirq_serverworks_set;
677 r->set = pirq_serverworks_set; 676 return 1;
678 return 1;
679 } 677 }
680 return 0; 678 return 0;
681} 679}
@@ -684,7 +682,7 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router,
684{ 682{
685 if (device != PCI_DEVICE_ID_SI_503) 683 if (device != PCI_DEVICE_ID_SI_503)
686 return 0; 684 return 0;
687 685
688 r->name = "SIS"; 686 r->name = "SIS";
689 r->get = pirq_sis_get; 687 r->get = pirq_sis_get;
690 r->set = pirq_sis_set; 688 r->set = pirq_sis_set;
@@ -693,47 +691,43 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router,
693 691
694static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 692static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
695{ 693{
696 switch(device) 694 switch (device) {
697 { 695 case PCI_DEVICE_ID_CYRIX_5520:
698 case PCI_DEVICE_ID_CYRIX_5520: 696 r->name = "NatSemi";
699 r->name = "NatSemi"; 697 r->get = pirq_cyrix_get;
700 r->get = pirq_cyrix_get; 698 r->set = pirq_cyrix_set;
701 r->set = pirq_cyrix_set; 699 return 1;
702 return 1;
703 } 700 }
704 return 0; 701 return 0;
705} 702}
706 703
707static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 704static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
708{ 705{
709 switch(device) 706 switch (device) {
710 { 707 case PCI_DEVICE_ID_OPTI_82C700:
711 case PCI_DEVICE_ID_OPTI_82C700: 708 r->name = "OPTI";
712 r->name = "OPTI"; 709 r->get = pirq_opti_get;
713 r->get = pirq_opti_get; 710 r->set = pirq_opti_set;
714 r->set = pirq_opti_set; 711 return 1;
715 return 1;
716 } 712 }
717 return 0; 713 return 0;
718} 714}
719 715
720static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 716static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
721{ 717{
722 switch(device) 718 switch (device) {
723 { 719 case PCI_DEVICE_ID_ITE_IT8330G_0:
724 case PCI_DEVICE_ID_ITE_IT8330G_0: 720 r->name = "ITE";
725 r->name = "ITE"; 721 r->get = pirq_ite_get;
726 r->get = pirq_ite_get; 722 r->set = pirq_ite_set;
727 r->set = pirq_ite_set; 723 return 1;
728 return 1;
729 } 724 }
730 return 0; 725 return 0;
731} 726}
732 727
733static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 728static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
734{ 729{
735 switch(device) 730 switch (device) {
736 {
737 case PCI_DEVICE_ID_AL_M1533: 731 case PCI_DEVICE_ID_AL_M1533:
738 case PCI_DEVICE_ID_AL_M1563: 732 case PCI_DEVICE_ID_AL_M1563:
739 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); 733 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
@@ -747,25 +741,24 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
747 741
748static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 742static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
749{ 743{
750 switch(device) 744 switch (device) {
751 { 745 case PCI_DEVICE_ID_AMD_VIPER_740B:
752 case PCI_DEVICE_ID_AMD_VIPER_740B: 746 r->name = "AMD756";
753 r->name = "AMD756"; 747 break;
754 break; 748 case PCI_DEVICE_ID_AMD_VIPER_7413:
755 case PCI_DEVICE_ID_AMD_VIPER_7413: 749 r->name = "AMD766";
756 r->name = "AMD766"; 750 break;
757 break; 751 case PCI_DEVICE_ID_AMD_VIPER_7443:
758 case PCI_DEVICE_ID_AMD_VIPER_7443: 752 r->name = "AMD768";
759 r->name = "AMD768"; 753 break;
760 break; 754 default:
761 default: 755 return 0;
762 return 0;
763 } 756 }
764 r->get = pirq_amd756_get; 757 r->get = pirq_amd756_get;
765 r->set = pirq_amd756_set; 758 r->set = pirq_amd756_set;
766 return 1; 759 return 1;
767} 760}
768 761
769static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) 762static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
770{ 763{
771 switch (device) { 764 switch (device) {
@@ -807,7 +800,7 @@ static struct pci_dev *pirq_router_dev;
807 * FIXME: should we have an option to say "generic for 800 * FIXME: should we have an option to say "generic for
808 * chipset" ? 801 * chipset" ?
809 */ 802 */
810 803
811static void __init pirq_find_router(struct irq_router *r) 804static void __init pirq_find_router(struct irq_router *r)
812{ 805{
813 struct irq_routing_table *rt = pirq_table; 806 struct irq_routing_table *rt = pirq_table;
@@ -826,7 +819,7 @@ static void __init pirq_find_router(struct irq_router *r)
826 r->name = "default"; 819 r->name = "default";
827 r->get = NULL; 820 r->get = NULL;
828 r->set = NULL; 821 r->set = NULL;
829 822
830 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 823 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
831 rt->rtr_vendor, rt->rtr_device); 824 rt->rtr_vendor, rt->rtr_device);
832 825
@@ -837,12 +830,14 @@ static void __init pirq_find_router(struct irq_router *r)
837 return; 830 return;
838 } 831 }
839 832
840 for( h = pirq_routers; h->vendor; h++) { 833 for (h = pirq_routers; h->vendor; h++) {
841 /* First look for a router match */ 834 /* First look for a router match */
842 if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) 835 if (rt->rtr_vendor == h->vendor &&
836 h->probe(r, pirq_router_dev, rt->rtr_device))
843 break; 837 break;
844 /* Fall back to a device match */ 838 /* Fall back to a device match */
845 if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) 839 if (pirq_router_dev->vendor == h->vendor &&
840 h->probe(r, pirq_router_dev, pirq_router_dev->device))
846 break; 841 break;
847 } 842 }
848 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
@@ -857,11 +852,13 @@ static void __init pirq_find_router(struct irq_router *r)
857static struct irq_info *pirq_get_info(struct pci_dev *dev) 852static struct irq_info *pirq_get_info(struct pci_dev *dev)
858{ 853{
859 struct irq_routing_table *rt = pirq_table; 854 struct irq_routing_table *rt = pirq_table;
860 int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); 855 int entries = (rt->size - sizeof(struct irq_routing_table)) /
856 sizeof(struct irq_info);
861 struct irq_info *info; 857 struct irq_info *info;
862 858
863 for (info = rt->slots; entries--; info++) 859 for (info = rt->slots; entries--; info++)
864 if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) 860 if (info->bus == dev->bus->number &&
861 PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
865 return info; 862 return info;
866 return NULL; 863 return NULL;
867} 864}
@@ -889,7 +886,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
889 886
890 if (!pirq_table) 887 if (!pirq_table)
891 return 0; 888 return 0;
892 889
893 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); 890 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
894 info = pirq_get_info(dev); 891 info = pirq_get_info(dev);
895 if (!info) { 892 if (!info) {
@@ -902,7 +899,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
902 DBG(" -> not routed\n" KERN_DEBUG); 899 DBG(" -> not routed\n" KERN_DEBUG);
903 return 0; 900 return 0;
904 } 901 }
905 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); 902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask,
903 pirq_table->exclusive_irqs);
906 mask &= pcibios_irq_mask; 904 mask &= pcibios_irq_mask;
907 905
908 /* Work around broken HP Pavilion Notebooks which assign USB to 906 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -915,7 +913,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
915 } 913 }
916 914
917 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ 915 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
918 if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { 916 if (acer_tm360_irqrouting && dev->irq == 11 &&
917 dev->vendor == PCI_VENDOR_ID_O2) {
919 pirq = 0x68; 918 pirq = 0x68;
920 mask = 0x400; 919 mask = 0x400;
921 dev->irq = r->get(pirq_router_dev, dev, pirq); 920 dev->irq = r->get(pirq_router_dev, dev, pirq);
@@ -928,17 +927,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
928 */ 927 */
929 newirq = dev->irq; 928 newirq = dev->irq;
930 if (newirq && !((1 << newirq) & mask)) { 929 if (newirq && !((1 << newirq) & mask)) {
931 if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; 930 if (pci_probe & PCI_USE_PIRQ_MASK)
932 else printk("\n" KERN_WARNING 931 newirq = 0;
933 "PCI: IRQ %i for device %s doesn't match PIRQ mask " 932 else
934 "- try pci=usepirqmask\n" KERN_DEBUG, newirq, 933 printk("\n" KERN_WARNING
935 pci_name(dev)); 934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n"
935 KERN_DEBUG, newirq,
936 pci_name(dev));
936 } 937 }
937 if (!newirq && assign) { 938 if (!newirq && assign) {
938 for (i = 0; i < 16; i++) { 939 for (i = 0; i < 16; i++) {
939 if (!(mask & (1 << i))) 940 if (!(mask & (1 << i)))
940 continue; 941 continue;
941 if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) 942 if (pirq_penalty[i] < pirq_penalty[newirq] &&
943 can_request_irq(i, IRQF_SHARED))
942 newirq = i; 944 newirq = i;
943 } 945 }
944 } 946 }
@@ -949,12 +951,13 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
949 irq = pirq & 0xf; 951 irq = pirq & 0xf;
950 DBG(" -> hardcoded IRQ %d\n", irq); 952 DBG(" -> hardcoded IRQ %d\n", irq);
951 msg = "Hardcoded"; 953 msg = "Hardcoded";
952 } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 954 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
953 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { 955 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
954 DBG(" -> got IRQ %d\n", irq); 956 DBG(" -> got IRQ %d\n", irq);
955 msg = "Found"; 957 msg = "Found";
956 eisa_set_level_irq(irq); 958 eisa_set_level_irq(irq);
957 } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 959 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
958 DBG(" -> assigning IRQ %d", newirq); 961 DBG(" -> assigning IRQ %d", newirq);
959 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 962 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
960 eisa_set_level_irq(newirq); 963 eisa_set_level_irq(newirq);
@@ -972,7 +975,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
972 } else 975 } else
973 return 0; 976 return 0;
974 } 977 }
975 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); 978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq,
979 pci_name(dev));
976 980
977 /* Update IRQ for all devices with the same pirq value */ 981 /* Update IRQ for all devices with the same pirq value */
978 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -984,20 +988,25 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
984 if (!info) 988 if (!info)
985 continue; 989 continue;
986 if (info->irq[pin].link == pirq) { 990 if (info->irq[pin].link == pirq) {
987 /* We refuse to override the dev->irq information. Give a warning! */ 991 /*
988 if ( dev2->irq && dev2->irq != irq && \ 992 * We refuse to override the dev->irq
993 * information. Give a warning!
994 */
995 if (dev2->irq && dev2->irq != irq && \
989 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
990 ((1 << dev2->irq) & mask)) ) { 997 ((1 << dev2->irq) & mask))) {
991#ifndef CONFIG_PCI_MSI 998#ifndef CONFIG_PCI_MSI
992 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 999 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
993 pci_name(dev2), dev2->irq, irq); 1000 pci_name(dev2), dev2->irq, irq);
994#endif 1001#endif
995 continue; 1002 continue;
996 } 1003 }
997 dev2->irq = irq; 1004 dev2->irq = irq;
998 pirq_penalty[irq]++; 1005 pirq_penalty[irq]++;
999 if (dev != dev2) 1006 if (dev != dev2)
1000 printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); 1007 printk(KERN_INFO
1008 "PCI: Sharing IRQ %d with %s\n",
1009 irq, pci_name(dev2));
1001 } 1010 }
1002 } 1011 }
1003 return 1; 1012 return 1;
@@ -1011,15 +1020,21 @@ static void __init pcibios_fixup_irqs(void)
1011 DBG(KERN_DEBUG "PCI: IRQ fixup\n"); 1020 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
1012 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1021 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1013 /* 1022 /*
1014 * If the BIOS has set an out of range IRQ number, just ignore it. 1023 * If the BIOS has set an out of range IRQ number, just
1015 * Also keep track of which IRQ's are already in use. 1024 * ignore it. Also keep track of which IRQ's are
1025 * already in use.
1016 */ 1026 */
1017 if (dev->irq >= 16) { 1027 if (dev->irq >= 16) {
1018 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); 1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n",
1029 pci_name(dev), dev->irq);
1019 dev->irq = 0; 1030 dev->irq = 0;
1020 } 1031 }
1021 /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ 1032 /*
1022 if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) 1033 * If the IRQ is already assigned to a PCI device,
1034 * ignore its ISA use penalty
1035 */
1036 if (pirq_penalty[dev->irq] >= 100 &&
1037 pirq_penalty[dev->irq] < 100000)
1023 pirq_penalty[dev->irq] = 0; 1038 pirq_penalty[dev->irq] = 0;
1024 pirq_penalty[dev->irq]++; 1039 pirq_penalty[dev->irq]++;
1025 } 1040 }
@@ -1031,13 +1046,17 @@ static void __init pcibios_fixup_irqs(void)
1031 /* 1046 /*
1032 * Recalculate IRQ numbers if we use the I/O APIC. 1047 * Recalculate IRQ numbers if we use the I/O APIC.
1033 */ 1048 */
1034 if (io_apic_assign_pci_irqs) 1049 if (io_apic_assign_pci_irqs) {
1035 {
1036 int irq; 1050 int irq;
1037 1051
1038 if (pin) { 1052 if (pin) {
1039 pin--; /* interrupt pins are numbered starting from 1 */ 1053 /*
1040 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); 1054 * interrupt pins are numbered starting
1055 * from 1
1056 */
1057 pin--;
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1059 PCI_SLOT(dev->devfn), pin);
1041 /* 1060 /*
1042 * Busses behind bridges are typically not listed in the MP-table. 1061 * Busses behind bridges are typically not listed in the MP-table.
1043 * In this case we have to look up the IRQ based on the parent bus, 1062 * In this case we have to look up the IRQ based on the parent bus,
@@ -1045,10 +1064,10 @@ static void __init pcibios_fixup_irqs(void)
1045 * busses itself so we should get into this branch reliably. 1064 * busses itself so we should get into this branch reliably.
1046 */ 1065 */
1047 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1066 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
1048 struct pci_dev * bridge = dev->bus->self; 1067 struct pci_dev *bridge = dev->bus->self;
1049 1068
1050 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1069 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1051 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1070 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1052 PCI_SLOT(bridge->devfn), pin); 1071 PCI_SLOT(bridge->devfn), pin);
1053 if (irq >= 0) 1072 if (irq >= 0)
1054 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1073 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
@@ -1078,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
1078{ 1097{
1079 if (!broken_hp_bios_irq9) { 1098 if (!broken_hp_bios_irq9) {
1080 broken_hp_bios_irq9 = 1; 1099 broken_hp_bios_irq9 = 1;
1081 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1100 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1101 d->ident);
1082 } 1102 }
1083 return 0; 1103 return 0;
1084} 1104}
@@ -1091,7 +1111,8 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
1091{ 1111{
1092 if (!acer_tm360_irqrouting) { 1112 if (!acer_tm360_irqrouting) {
1093 acer_tm360_irqrouting = 1; 1113 acer_tm360_irqrouting = 1;
1094 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); 1114 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
1115 d->ident);
1095 } 1116 }
1096 return 0; 1117 return 0;
1097} 1118}
@@ -1103,7 +1124,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1103 .matches = { 1124 .matches = {
1104 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1125 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1105 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), 1126 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
1106 DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), 1127 DMI_MATCH(DMI_PRODUCT_VERSION,
1128 "HP Pavilion Notebook Model GE"),
1107 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), 1129 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
1108 }, 1130 },
1109 }, 1131 },
@@ -1118,7 +1140,7 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1118 { } 1140 { }
1119}; 1141};
1120 1142
1121static int __init pcibios_irq_init(void) 1143int __init pcibios_irq_init(void)
1122{ 1144{
1123 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1145 DBG(KERN_DEBUG "PCI: IRQ init\n");
1124 1146
@@ -1138,11 +1160,14 @@ static int __init pcibios_irq_init(void)
1138 pirq_find_router(&pirq_router); 1160 pirq_find_router(&pirq_router);
1139 if (pirq_table->exclusive_irqs) { 1161 if (pirq_table->exclusive_irqs) {
1140 int i; 1162 int i;
1141 for (i=0; i<16; i++) 1163 for (i = 0; i < 16; i++)
1142 if (!(pirq_table->exclusive_irqs & (1 << i))) 1164 if (!(pirq_table->exclusive_irqs & (1 << i)))
1143 pirq_penalty[i] += 100; 1165 pirq_penalty[i] += 100;
1144 } 1166 }
1145 /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ 1167 /*
1168 * If we're using the I/O APIC, avoid using the PCI IRQ
1169 * routing table
1170 */
1146 if (io_apic_assign_pci_irqs) 1171 if (io_apic_assign_pci_irqs)
1147 pirq_table = NULL; 1172 pirq_table = NULL;
1148 } 1173 }
@@ -1153,9 +1178,6 @@ static int __init pcibios_irq_init(void)
1153 return 0; 1178 return 0;
1154} 1179}
1155 1180
1156subsys_initcall(pcibios_irq_init);
1157
1158
1159static void pirq_penalize_isa_irq(int irq, int active) 1181static void pirq_penalize_isa_irq(int irq, int active)
1160{ 1182{
1161 /* 1183 /*
@@ -1189,7 +1211,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
1189 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1211 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
1190 char *msg = ""; 1212 char *msg = "";
1191 1213
1192 pin--; /* interrupt pins are numbered starting from 1 */ 1214 pin--; /* interrupt pins are numbered starting from 1 */
1193 1215
1194 if (io_apic_assign_pci_irqs) { 1216 if (io_apic_assign_pci_irqs) {
1195 int irq; 1217 int irq;
@@ -1203,19 +1225,22 @@ static int pirq_enable_irq(struct pci_dev *dev)
1203 */ 1225 */
1204 temp_dev = dev; 1226 temp_dev = dev;
1205 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1227 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
1206 struct pci_dev * bridge = dev->bus->self; 1228 struct pci_dev *bridge = dev->bus->self;
1207 1229
1208 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1230 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1209 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1210 PCI_SLOT(bridge->devfn), pin); 1232 PCI_SLOT(bridge->devfn), pin);
1211 if (irq >= 0) 1233 if (irq >= 0)
1212 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1234 printk(KERN_WARNING
1213 pci_name(bridge), 'A' + pin, irq); 1235 "PCI: using PPB %s[%c] to get irq %d\n",
1236 pci_name(bridge),
1237 'A' + pin, irq);
1214 dev = bridge; 1238 dev = bridge;
1215 } 1239 }
1216 dev = temp_dev; 1240 dev = temp_dev;
1217 if (irq >= 0) { 1241 if (irq >= 0) {
1218 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1242 printk(KERN_INFO
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
1219 pci_name(dev), 'A' + pin, irq); 1244 pci_name(dev), 'A' + pin, irq);
1220 dev->irq = irq; 1245 dev->irq = irq;
1221 return 0; 1246 return 0;
@@ -1226,12 +1251,17 @@ static int pirq_enable_irq(struct pci_dev *dev)
1226 else 1251 else
1227 msg = " Please try using pci=biosirq."; 1252 msg = " Please try using pci=biosirq.";
1228 1253
1229 /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ 1254 /*
1230 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) 1255 * With IDE legacy devices the IRQ lookup failure is not
1256 * a problem..
1257 */
1258 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
1259 !(dev->class & 0x5))
1231 return 0; 1260 return 0;
1232 1261
1233 printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1262 printk(KERN_WARNING
1234 'A' + pin, pci_name(dev), msg); 1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
1264 'A' + pin, pci_name(dev), msg);
1235 } 1265 }
1236 return 0; 1266 return 0;
1237} 1267}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index a67921ce60af..ec9ce35e44d6 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -55,4 +55,21 @@ static int __init pci_legacy_init(void)
55 return 0; 55 return 0;
56} 56}
57 57
58subsys_initcall(pci_legacy_init); 58int __init pci_subsys_init(void)
59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
63#ifdef CONFIG_ACPI
64 pci_acpi_init();
65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
69 pci_legacy_init();
70 pcibios_irq_init();
71 pcibios_init();
72
73 return 0;
74}
75subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 0cfebecf2a8f..23faaa890ffc 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -374,7 +374,7 @@ reject:
374 374
375static int __initdata known_bridge; 375static int __initdata known_bridge;
376 376
377void __init __pci_mmcfg_init(int early) 377static void __init __pci_mmcfg_init(int early)
378{ 378{
379 /* MMCONFIG disabled */ 379 /* MMCONFIG disabled */
380 if ((pci_probe & PCI_PROBE_MMCONF) == 0) 380 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
diff --git a/arch/x86/pci/mp_bus_to_node.c b/arch/x86/pci/mp_bus_to_node.c
deleted file mode 100644
index 022943999b84..000000000000
--- a/arch/x86/pci/mp_bus_to_node.c
+++ /dev/null
@@ -1,23 +0,0 @@
1#include <linux/pci.h>
2#include <linux/init.h>
3#include <linux/topology.h>
4
5#define BUS_NR 256
6
7static unsigned char mp_bus_to_node[BUS_NR];
8
9void set_mp_bus_to_node(int busnum, int node)
10{
11 if (busnum >= 0 && busnum < BUS_NR)
12 mp_bus_to_node[busnum] = (unsigned char) node;
13}
14
15int get_mp_bus_to_node(int busnum)
16{
17 int node;
18
19 if (busnum < 0 || busnum > (BUS_NR - 1))
20 return 0;
21 node = mp_bus_to_node[busnum];
22 return node;
23}
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index d9afbae5092b..f4b16dc11dad 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,50 +1,26 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h> 8#include <mach_apic.h>
9#include <asm/mpspec.h>
9#include "pci.h" 10#include "pci.h"
10 11
11#define XQUAD_PORTIO_BASE 0xfe400000 12#define XQUAD_PORTIO_BASE 0xfe400000
12#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ 13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
13 14
14int mp_bus_id_to_node[MAX_MP_BUSSES];
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 15#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 16
17int mp_bus_id_to_local[MAX_MP_BUSSES];
18#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) 17#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
19 18
20void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
21 struct mpc_config_translation *translation)
22{
23 int quad = translation->trans_quad;
24 int local = translation->trans_local;
25
26 mp_bus_id_to_node[m->mpc_busid] = quad;
27 mp_bus_id_to_local[m->mpc_busid] = local;
28 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
29 m->mpc_busid, name, quad);
30}
31
32int quad_local_to_mp_bus_id [NR_CPUS/4][4];
33#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
34void mpc_oem_pci_bus(struct mpc_config_bus *m,
35 struct mpc_config_translation *translation)
36{
37 int quad = translation->trans_quad;
38 int local = translation->trans_local;
39
40 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
41}
42 20
43/* Where the IO area was mapped on multiquad, always 0 otherwise */ 21/* Where the IO area was mapped on multiquad, always 0 otherwise */
44void *xquad_portio; 22void *xquad_portio;
45#ifdef CONFIG_X86_NUMAQ
46EXPORT_SYMBOL(xquad_portio); 23EXPORT_SYMBOL(xquad_portio);
47#endif
48 24
49#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) 25#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
50 26
@@ -175,10 +151,13 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
175} 151}
176DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
177 153
178static int __init pci_numa_init(void) 154int __init pci_numaq_init(void)
179{ 155{
180 int quad; 156 int quad;
181 157
158 if (!found_numaq)
159 return 0;
160
182 raw_pci_ops = &pci_direct_conf1_mq; 161 raw_pci_ops = &pci_direct_conf1_mq;
183 162
184 if (pcibios_scanned++) 163 if (pcibios_scanned++)
@@ -197,5 +176,3 @@ static int __init pci_numa_init(void)
197 } 176 }
198 return 0; 177 return 0;
199} 178}
200
201subsys_initcall(pci_numa_init);
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 720c4c554534..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -27,6 +27,8 @@
27#define PCI_CAN_SKIP_ISA_ALIGN 0x8000 27#define PCI_CAN_SKIP_ISA_ALIGN 0x8000
28#define PCI_USE__CRS 0x10000 28#define PCI_USE__CRS 0x10000
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000
30 32
31extern unsigned int pci_probe; 33extern unsigned int pci_probe;
32extern unsigned long pirq_table_addr; 34extern unsigned long pirq_table_addr;
@@ -38,9 +40,6 @@ enum pci_bf_sort_state {
38 pci_dmi_bf, 40 pci_dmi_bf,
39}; 41};
40 42
41extern void __init dmi_check_pciprobe(void);
42extern void __init dmi_check_skip_isa_align(void);
43
44/* pci-i386.c */ 43/* pci-i386.c */
45 44
46extern unsigned int pcibios_max_latency; 45extern unsigned int pcibios_max_latency;
@@ -98,10 +97,20 @@ extern struct pci_raw_ops *raw_pci_ext_ops;
98 97
99extern struct pci_raw_ops pci_direct_conf1; 98extern struct pci_raw_ops pci_direct_conf1;
100 99
100/* arch_initcall level */
101extern int pci_direct_probe(void); 101extern int pci_direct_probe(void);
102extern void pci_direct_init(int type); 102extern void pci_direct_init(int type);
103extern void pci_pcbios_init(void); 103extern void pci_pcbios_init(void);
104extern int pci_olpc_init(void); 104extern int pci_olpc_init(void);
105extern void __init dmi_check_pciprobe(void);
106extern void __init dmi_check_skip_isa_align(void);
107
108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void);
111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
113extern int __init pcibios_init(void);
105 114
106/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
107 116
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index c2df4e97eed6..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -8,18 +8,19 @@
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/init.h> 9#include <linux/init.h>
10 10
11#include "cobalt.h" 11#include <asm/setup.h>
12#include "lithium.h" 12#include <asm/visws/cobalt.h>
13#include <asm/visws/lithium.h>
13 14
14#include "pci.h" 15#include "pci.h"
15 16
16static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } 17static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
17static void pci_visws_disable_irq(struct pci_dev *dev) { } 18static void pci_visws_disable_irq(struct pci_dev *dev) { }
18 19
19int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; 20/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */
20void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; 21/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */
21 22
22void __init pcibios_penalize_isa_irq(int irq, int active) {} 23/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */
23 24
24 25
25unsigned int pci_bus0, pci_bus1; 26unsigned int pci_bus0, pci_bus1;
@@ -85,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
85 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
86} 87}
87 88
88static int __init pcibios_init(void) 89int __init pci_visws_init(void)
89{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
90 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
91 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
92 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -104,5 +111,3 @@ static int __init pcibios_init(void)
104 pcibios_resource_survey(); 111 pcibios_resource_survey();
105 return 0; 112 return 0;
106} 113}
107
108subsys_initcall(pcibios_init);
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index b542355e0e34..6dd000dd7933 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -83,7 +83,7 @@ static int set_up_temporary_mappings(void)
83 83
84 /* Set up the direct mapping from scratch */ 84 /* Set up the direct mapping from scratch */
85 start = (unsigned long)pfn_to_kaddr(0); 85 start = (unsigned long)pfn_to_kaddr(0);
86 end = (unsigned long)pfn_to_kaddr(end_pfn); 86 end = (unsigned long)pfn_to_kaddr(max_pfn);
87 87
88 for (; start < end; start = next) { 88 for (; start < end; start = next) {
89 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); 89 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index efa2ba7c6005..1ef0f90813d6 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -23,7 +23,7 @@
23 23
24#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
25 25
26static long vdso_fallback_gettime(long clock, struct timespec *ts) 26notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
27{ 27{
28 long ret; 28 long ret;
29 asm("syscall" : "=a" (ret) : 29 asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
31 return ret; 31 return ret;
32} 32}
33 33
34static inline long vgetns(void) 34notrace static inline long vgetns(void)
35{ 35{
36 long v; 36 long v;
37 cycles_t (*vread)(void); 37 cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
40 return (v * gtod->clock.mult) >> gtod->clock.shift; 40 return (v * gtod->clock.mult) >> gtod->clock.shift;
41} 41}
42 42
43static noinline int do_realtime(struct timespec *ts) 43notrace static noinline int do_realtime(struct timespec *ts)
44{ 44{
45 unsigned long seq, ns; 45 unsigned long seq, ns;
46 do { 46 do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
54} 54}
55 55
56/* Copy of the version in kernel/time.c which we cannot directly access */ 56/* Copy of the version in kernel/time.c which we cannot directly access */
57static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) 57notrace static void
58vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
58{ 59{
59 while (nsec >= NSEC_PER_SEC) { 60 while (nsec >= NSEC_PER_SEC) {
60 nsec -= NSEC_PER_SEC; 61 nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
68 ts->tv_nsec = nsec; 69 ts->tv_nsec = nsec;
69} 70}
70 71
71static noinline int do_monotonic(struct timespec *ts) 72notrace static noinline int do_monotonic(struct timespec *ts)
72{ 73{
73 unsigned long seq, ns, secs; 74 unsigned long seq, ns, secs;
74 do { 75 do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
82 return 0; 83 return 0;
83} 84}
84 85
85int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) 86notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
86{ 87{
87 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) 88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
88 switch (clock) { 89 switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
96int clock_gettime(clockid_t, struct timespec *) 97int clock_gettime(clockid_t, struct timespec *)
97 __attribute__((weak, alias("__vdso_clock_gettime"))); 98 __attribute__((weak, alias("__vdso_clock_gettime")));
98 99
99int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) 100notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
100{ 101{
101 long ret; 102 long ret;
102 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { 103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index cf058fecfcee..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,30 +193,16 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206static int use_sysenter __read_mostly = -1; 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
207 201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
208#define vdso32_sysenter() (use_sysenter > 0)
209 202
210/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
211void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
212{ 205{
213 if (use_sysenter < 0) {
214 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
215 use_sysenter = 1;
216 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
217 use_sysenter = 1;
218 }
219
220 /* Load these always in case some future AMD CPU supports 206 /* Load these always in case some future AMD CPU supports
221 SYSENTER from compat mode too. */ 207 SYSENTER from compat mode too. */
222 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 208 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
@@ -235,6 +221,7 @@ static inline void map_compat_vdso(int map)
235#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
236 222
237#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
238 225
239void enable_sep_cpu(void) 226void enable_sep_cpu(void)
240{ 227{
@@ -305,12 +292,15 @@ int __init sysenter_setup(void)
305 gate_vma_init(); 292 gate_vma_init();
306#endif 293#endif
307 294
308 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
309 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
310 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
311 } else { 298 } else if (vdso32_sysenter()){
312 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
313 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
314 } 304 }
315 305
316 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index c8097f17f8a9..9fbc6b20026b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -13,7 +13,8 @@
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h" 14#include "vextern.h"
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 18{
18 unsigned int p; 19 unsigned int p;
19 20
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 3fdd51497a83..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,12 +16,13 @@
16#include "vextern.h" /* Just for VMAGIC. */ 16#include "vextern.h" /* Just for VMAGIC. */
17#undef VEXTERN 17#undef VEXTERN
18 18
19int vdso_enabled = 1; 19unsigned int __read_mostly vdso_enabled = 1;
20 20
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6c388e593bc8..3815e425f470 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,9 +6,25 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15
16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 if X86_32
19 default 32 if X86_64
20 depends on XEN
21 help
22 The pseudo-physical to machine address array is sized
23 according to the maximum possible memory size of a Xen
24 domain. This array uses 1 page per gigabyte, so there's no
25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on PM
30 default y \ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..59c1e539aed2 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1obj-y := enlighten.o setup.o multicalls.o mmu.o \
2 time.o manage.o xen-asm.o grant-table.o 2 time.o xen-asm_$(BITS).o grant-table.o suspend.o
3 3
4obj-$(CONFIG_SMP) += smp.o 4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09c1c69c37a..9ff6e3cbf08f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
33#include <xen/interface/sched.h> 33#include <xen/interface/sched.h>
34#include <xen/features.h> 34#include <xen/features.h>
35#include <xen/page.h> 35#include <xen/page.h>
36#include <xen/hvc-console.h>
36 37
37#include <asm/paravirt.h> 38#include <asm/paravirt.h>
38#include <asm/page.h> 39#include <asm/page.h>
@@ -40,6 +41,7 @@
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
@@ -56,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
56DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
57 59
58/* 60/*
61 * Identity map, in addition to plain kernel map. This needs to be
62 * large enough to allocate page table pages to allocate the rest.
63 * Each page can map 2MB.
64 */
65static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
66
67#ifdef CONFIG_X86_64
68/* l3 pud for userspace vsyscall mapping */
69static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
70#endif /* CONFIG_X86_64 */
71
72/*
59 * Note about cr3 (pagetable base) values: 73 * Note about cr3 (pagetable base) values:
60 * 74 *
61 * xen_cr3 contains the current logical cr3 value; it contains the 75 * xen_cr3 contains the current logical cr3 value; it contains the
@@ -75,13 +89,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
75struct start_info *xen_start_info; 89struct start_info *xen_start_info;
76EXPORT_SYMBOL_GPL(xen_start_info); 90EXPORT_SYMBOL_GPL(xen_start_info);
77 91
78static /* __initdata */ struct shared_info dummy_shared_info; 92struct shared_info xen_dummy_shared_info;
79 93
80/* 94/*
81 * Point at some empty memory to start with. We map the real shared_info 95 * Point at some empty memory to start with. We map the real shared_info
82 * page as soon as fixmap is up and running. 96 * page as soon as fixmap is up and running.
83 */ 97 */
84struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; 98struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
85 99
86/* 100/*
87 * Flag to determine whether vcpu info placement is available on all 101 * Flag to determine whether vcpu info placement is available on all
@@ -98,13 +112,13 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
98 */ 112 */
99static int have_vcpu_info_placement = 1; 113static int have_vcpu_info_placement = 1;
100 114
101static void __init xen_vcpu_setup(int cpu) 115static void xen_vcpu_setup(int cpu)
102{ 116{
103 struct vcpu_register_vcpu_info info; 117 struct vcpu_register_vcpu_info info;
104 int err; 118 int err;
105 struct vcpu_info *vcpup; 119 struct vcpu_info *vcpup;
106 120
107 BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); 121 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
108 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 122 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
109 123
110 if (!have_vcpu_info_placement) 124 if (!have_vcpu_info_placement)
@@ -136,11 +150,45 @@ static void __init xen_vcpu_setup(int cpu)
136 } 150 }
137} 151}
138 152
153/*
154 * On restore, set the vcpu placement up again.
155 * If it fails, then we're in a bad state, since
156 * we can't back out from using it...
157 */
158void xen_vcpu_restore(void)
159{
160 if (have_vcpu_info_placement) {
161 int cpu;
162
163 for_each_online_cpu(cpu) {
164 bool other_cpu = (cpu != smp_processor_id());
165
166 if (other_cpu &&
167 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
168 BUG();
169
170 xen_vcpu_setup(cpu);
171
172 if (other_cpu &&
173 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
174 BUG();
175 }
176
177 BUG_ON(!have_vcpu_info_placement);
178 }
179}
180
139static void __init xen_banner(void) 181static void __init xen_banner(void)
140{ 182{
183 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
184 struct xen_extraversion extra;
185 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
186
141 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 187 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
142 pv_info.name); 188 pv_info.name);
143 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 189 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
190 version >> 16, version & 0xffff, extra.extraversion,
191 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
144} 192}
145 193
146static void xen_cpuid(unsigned int *ax, unsigned int *bx, 194static void xen_cpuid(unsigned int *ax, unsigned int *bx,
@@ -235,13 +283,13 @@ static void xen_irq_enable(void)
235{ 283{
236 struct vcpu_info *vcpu; 284 struct vcpu_info *vcpu;
237 285
238 /* There's a one instruction preempt window here. We need to 286 /* We don't need to worry about being preempted here, since
239 make sure we're don't switch CPUs between getting the vcpu 287 either a) interrupts are disabled, so no preemption, or b)
240 pointer and updating the mask. */ 288 the caller is confused and is trying to re-enable interrupts
241 preempt_disable(); 289 on an indeterminate processor. */
290
242 vcpu = x86_read_percpu(xen_vcpu); 291 vcpu = x86_read_percpu(xen_vcpu);
243 vcpu->evtchn_upcall_mask = 0; 292 vcpu->evtchn_upcall_mask = 0;
244 preempt_enable_no_resched();
245 293
246 /* Doesn't matter if we get preempted here, because any 294 /* Doesn't matter if we get preempted here, because any
247 pending event will get dealt with anyway. */ 295 pending event will get dealt with anyway. */
@@ -254,7 +302,7 @@ static void xen_irq_enable(void)
254static void xen_safe_halt(void) 302static void xen_safe_halt(void)
255{ 303{
256 /* Blocking includes an implicit local_irq_enable(). */ 304 /* Blocking includes an implicit local_irq_enable(). */
257 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) 305 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
258 BUG(); 306 BUG();
259} 307}
260 308
@@ -332,14 +380,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
332 380
333static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 381static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
334{ 382{
335 xen_mc_batch();
336
337 load_TLS_descriptor(t, cpu, 0);
338 load_TLS_descriptor(t, cpu, 1);
339 load_TLS_descriptor(t, cpu, 2);
340
341 xen_mc_issue(PARAVIRT_LAZY_CPU);
342
343 /* 383 /*
344 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 384 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
345 * it means we're in a context switch, and %gs has just been 385 * it means we're in a context switch, and %gs has just been
@@ -348,10 +388,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
348 * Either way, it has been saved, and the new value will get 388 * Either way, it has been saved, and the new value will get
349 * loaded properly. This will go away as soon as Xen has been 389 * loaded properly. This will go away as soon as Xen has been
350 * modified to not save/restore %gs for normal hypercalls. 390 * modified to not save/restore %gs for normal hypercalls.
391 *
392 * On x86_64, this hack is not used for %gs, because gs points
393 * to KERNEL_GS_BASE (and uses it for PDA references), so we
394 * must not zero %gs on x86_64
395 *
396 * For x86_64, we need to zero %fs, otherwise we may get an
397 * exception between the new %fs descriptor being loaded and
398 * %fs being effectively cleared at __switch_to().
351 */ 399 */
352 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 400 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
401#ifdef CONFIG_X86_32
353 loadsegment(gs, 0); 402 loadsegment(gs, 0);
403#else
404 loadsegment(fs, 0);
405#endif
406 }
407
408 xen_mc_batch();
409
410 load_TLS_descriptor(t, cpu, 0);
411 load_TLS_descriptor(t, cpu, 1);
412 load_TLS_descriptor(t, cpu, 2);
413
414 xen_mc_issue(PARAVIRT_LAZY_CPU);
415}
416
417#ifdef CONFIG_X86_64
418static void xen_load_gs_index(unsigned int idx)
419{
420 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
421 BUG();
354} 422}
423#endif
355 424
356static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 425static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
357 const void *ptr) 426 const void *ptr)
@@ -369,23 +438,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
369 preempt_enable(); 438 preempt_enable();
370} 439}
371 440
372static int cvt_gate_to_trap(int vector, u32 low, u32 high, 441static int cvt_gate_to_trap(int vector, const gate_desc *val,
373 struct trap_info *info) 442 struct trap_info *info)
374{ 443{
375 u8 type, dpl; 444 if (val->type != 0xf && val->type != 0xe)
376
377 type = (high >> 8) & 0x1f;
378 dpl = (high >> 13) & 3;
379
380 if (type != 0xf && type != 0xe)
381 return 0; 445 return 0;
382 446
383 info->vector = vector; 447 info->vector = vector;
384 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 448 info->address = gate_offset(*val);
385 info->cs = low >> 16; 449 info->cs = gate_segment(*val);
386 info->flags = dpl; 450 info->flags = val->dpl;
387 /* interrupt gates clear IF */ 451 /* interrupt gates clear IF */
388 if (type == 0xe) 452 if (val->type == 0xe)
389 info->flags |= 4; 453 info->flags |= 4;
390 454
391 return 1; 455 return 1;
@@ -412,11 +476,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
412 476
413 if (p >= start && (p + 8) <= end) { 477 if (p >= start && (p + 8) <= end) {
414 struct trap_info info[2]; 478 struct trap_info info[2];
415 u32 *desc = (u32 *)g;
416 479
417 info[1].address = 0; 480 info[1].address = 0;
418 481
419 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 482 if (cvt_gate_to_trap(entrynum, g, &info[0]))
420 if (HYPERVISOR_set_trap_table(info)) 483 if (HYPERVISOR_set_trap_table(info))
421 BUG(); 484 BUG();
422 } 485 }
@@ -429,13 +492,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
429{ 492{
430 unsigned in, out, count; 493 unsigned in, out, count;
431 494
432 count = (desc->size+1) / 8; 495 count = (desc->size+1) / sizeof(gate_desc);
433 BUG_ON(count > 256); 496 BUG_ON(count > 256);
434 497
435 for (in = out = 0; in < count; in++) { 498 for (in = out = 0; in < count; in++) {
436 const u32 *entry = (u32 *)(desc->address + in * 8); 499 gate_desc *entry = (gate_desc*)(desc->address) + in;
437 500
438 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 501 if (cvt_gate_to_trap(in, entry, &traps[out]))
439 out++; 502 out++;
440 } 503 }
441 traps[out].address = 0; 504 traps[out].address = 0;
@@ -607,6 +670,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
607 xen_mc_issue(PARAVIRT_LAZY_MMU); 670 xen_mc_issue(PARAVIRT_LAZY_MMU);
608} 671}
609 672
673static void xen_clts(void)
674{
675 struct multicall_space mcs;
676
677 mcs = xen_mc_entry(0);
678
679 MULTI_fpu_taskswitch(mcs.mc, 0);
680
681 xen_mc_issue(PARAVIRT_LAZY_CPU);
682}
683
684static void xen_write_cr0(unsigned long cr0)
685{
686 struct multicall_space mcs;
687
688 /* Only pay attention to cr0.TS; everything else is
689 ignored. */
690 mcs = xen_mc_entry(0);
691
692 MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
693
694 xen_mc_issue(PARAVIRT_LAZY_CPU);
695}
696
610static void xen_write_cr2(unsigned long cr2) 697static void xen_write_cr2(unsigned long cr2)
611{ 698{
612 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 699 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +711,10 @@ static unsigned long xen_read_cr2_direct(void)
624 711
625static void xen_write_cr4(unsigned long cr4) 712static void xen_write_cr4(unsigned long cr4)
626{ 713{
627 /* Just ignore cr4 changes; Xen doesn't allow us to do 714 cr4 &= ~X86_CR4_PGE;
628 anything anyway. */ 715 cr4 &= ~X86_CR4_PSE;
716
717 native_write_cr4(cr4);
629} 718}
630 719
631static unsigned long xen_read_cr3(void) 720static unsigned long xen_read_cr3(void)
@@ -638,33 +727,89 @@ static void set_current_cr3(void *v)
638 x86_write_percpu(xen_current_cr3, (unsigned long)v); 727 x86_write_percpu(xen_current_cr3, (unsigned long)v);
639} 728}
640 729
641static void xen_write_cr3(unsigned long cr3) 730static void __xen_write_cr3(bool kernel, unsigned long cr3)
642{ 731{
643 struct mmuext_op *op; 732 struct mmuext_op *op;
644 struct multicall_space mcs; 733 struct multicall_space mcs;
645 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 734 unsigned long mfn;
646 735
647 BUG_ON(preemptible()); 736 if (cr3)
737 mfn = pfn_to_mfn(PFN_DOWN(cr3));
738 else
739 mfn = 0;
648 740
649 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 741 WARN_ON(mfn == 0 && kernel);
650 742
651 /* Update while interrupts are disabled, so its atomic with 743 mcs = __xen_mc_entry(sizeof(*op));
652 respect to ipis */
653 x86_write_percpu(xen_cr3, cr3);
654 744
655 op = mcs.args; 745 op = mcs.args;
656 op->cmd = MMUEXT_NEW_BASEPTR; 746 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
657 op->arg1.mfn = mfn; 747 op->arg1.mfn = mfn;
658 748
659 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 749 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
660 750
661 /* Update xen_update_cr3 once the batch has actually 751 if (kernel) {
662 been submitted. */ 752 x86_write_percpu(xen_cr3, cr3);
663 xen_mc_callback(set_current_cr3, (void *)cr3); 753
754 /* Update xen_current_cr3 once the batch has actually
755 been submitted. */
756 xen_mc_callback(set_current_cr3, (void *)cr3);
757 }
758}
759
760static void xen_write_cr3(unsigned long cr3)
761{
762 BUG_ON(preemptible());
763
764 xen_mc_batch(); /* disables interrupts */
765
766 /* Update while interrupts are disabled, so its atomic with
767 respect to ipis */
768 x86_write_percpu(xen_cr3, cr3);
769
770 __xen_write_cr3(true, cr3);
771
772#ifdef CONFIG_X86_64
773 {
774 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
775 if (user_pgd)
776 __xen_write_cr3(false, __pa(user_pgd));
777 else
778 __xen_write_cr3(false, 0);
779 }
780#endif
664 781
665 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 782 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
666} 783}
667 784
785static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
786{
787 int ret;
788
789 ret = 0;
790
791 switch(msr) {
792#ifdef CONFIG_X86_64
793 unsigned which;
794 u64 base;
795
796 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
797 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
798 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
799
800 set:
801 base = ((u64)high << 32) | low;
802 if (HYPERVISOR_set_segment_base(which, base) != 0)
803 ret = -EFAULT;
804 break;
805#endif
806 default:
807 ret = native_write_msr_safe(msr, low, high);
808 }
809
810 return ret;
811}
812
668/* Early in boot, while setting up the initial pagetable, assume 813/* Early in boot, while setting up the initial pagetable, assume
669 everything is pinned. */ 814 everything is pinned. */
670static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 815static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -721,6 +866,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
721 xen_alloc_ptpage(mm, pfn, PT_PMD); 866 xen_alloc_ptpage(mm, pfn, PT_PMD);
722} 867}
723 868
869static int xen_pgd_alloc(struct mm_struct *mm)
870{
871 pgd_t *pgd = mm->pgd;
872 int ret = 0;
873
874 BUG_ON(PagePinned(virt_to_page(pgd)));
875
876#ifdef CONFIG_X86_64
877 {
878 struct page *page = virt_to_page(pgd);
879 pgd_t *user_pgd;
880
881 BUG_ON(page->private != 0);
882
883 ret = -ENOMEM;
884
885 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
886 page->private = (unsigned long)user_pgd;
887
888 if (user_pgd != NULL) {
889 user_pgd[pgd_index(VSYSCALL_START)] =
890 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
891 ret = 0;
892 }
893
894 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
895 }
896#endif
897
898 return ret;
899}
900
901static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
902{
903#ifdef CONFIG_X86_64
904 pgd_t *user_pgd = xen_get_user_pgd(pgd);
905
906 if (user_pgd)
907 free_page((unsigned long)user_pgd);
908#endif
909}
910
724/* This should never happen until we're OK to use struct page */ 911/* This should never happen until we're OK to use struct page */
725static void xen_release_ptpage(u32 pfn, unsigned level) 912static void xen_release_ptpage(u32 pfn, unsigned level)
726{ 913{
@@ -746,6 +933,18 @@ static void xen_release_pmd(u32 pfn)
746 xen_release_ptpage(pfn, PT_PMD); 933 xen_release_ptpage(pfn, PT_PMD);
747} 934}
748 935
936#if PAGETABLE_LEVELS == 4
937static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
938{
939 xen_alloc_ptpage(mm, pfn, PT_PUD);
940}
941
942static void xen_release_pud(u32 pfn)
943{
944 xen_release_ptpage(pfn, PT_PUD);
945}
946#endif
947
749#ifdef CONFIG_HIGHPTE 948#ifdef CONFIG_HIGHPTE
750static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 949static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
751{ 950{
@@ -784,68 +983,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
784 983
785static __init void xen_pagetable_setup_start(pgd_t *base) 984static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 985{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
789
790 /* special set_pte for pagetable initialization */
791 pv_mmu_ops.set_pte = xen_set_pte_init;
792
793 init_mm.pgd = base;
794 /*
795 * copy top-level of Xen-supplied pagetable into place. This
796 * is a stand-in while we copy the pmd pages.
797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799
800 /*
801 * For PAE, need to allocate new pmds, rather than
802 * share Xen's, since Xen doesn't like pmd's being
803 * shared between address spaces.
804 */
805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808
809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
810 PAGE_SIZE);
811
812 make_lowmem_page_readonly(pmd);
813
814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
815 } else
816 pgd_clear(&base[i]);
817 }
818
819 /* make sure zero_page is mapped RO so we can use it in pagetables */
820 make_lowmem_page_readonly(empty_zero_page);
821 make_lowmem_page_readonly(base);
822 /*
823 * Switch to new pagetable. This is done before
824 * pagetable_init has done anything so that the new pages
825 * added to the table can be prepared properly for Xen.
826 */
827 xen_write_cr3(__pa(base));
828
829 /* Unpin initial Xen pagetable */
830 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
831 PFN_DOWN(__pa(xen_start_info->pt_base)));
832} 986}
833 987
834static __init void setup_shared_info(void) 988void xen_setup_shared_info(void)
835{ 989{
836 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 990 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
837 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 991 set_fixmap(FIX_PARAVIRT_BOOTMAP,
838 992 xen_start_info->shared_info);
839 /* 993
840 * Create a mapping for the shared info page. 994 HYPERVISOR_shared_info =
841 * Should be set_fixmap(), but shared_info is a machine 995 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
842 * address with no corresponding pseudo-phys address.
843 */
844 set_pte_mfn(addr,
845 PFN_DOWN(xen_start_info->shared_info),
846 PAGE_KERNEL);
847
848 HYPERVISOR_shared_info = (struct shared_info *)addr;
849 } else 996 } else
850 HYPERVISOR_shared_info = 997 HYPERVISOR_shared_info =
851 (struct shared_info *)__va(xen_start_info->shared_info); 998 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -854,27 +1001,43 @@ static __init void setup_shared_info(void)
854 /* In UP this is as good a place as any to set up shared info */ 1001 /* In UP this is as good a place as any to set up shared info */
855 xen_setup_vcpu_info_placement(); 1002 xen_setup_vcpu_info_placement();
856#endif 1003#endif
1004
1005 xen_setup_mfn_list_list();
857} 1006}
858 1007
859static __init void xen_pagetable_setup_done(pgd_t *base) 1008static __init void xen_pagetable_setup_done(pgd_t *base)
860{ 1009{
1010 xen_setup_shared_info();
1011}
1012
1013static __init void xen_post_allocator_init(void)
1014{
1015 pv_mmu_ops.set_pte = xen_set_pte;
1016 pv_mmu_ops.set_pmd = xen_set_pmd;
1017 pv_mmu_ops.set_pud = xen_set_pud;
1018#if PAGETABLE_LEVELS == 4
1019 pv_mmu_ops.set_pgd = xen_set_pgd;
1020#endif
1021
861 /* This will work as long as patching hasn't happened yet 1022 /* This will work as long as patching hasn't happened yet
862 (which it hasn't) */ 1023 (which it hasn't) */
863 pv_mmu_ops.alloc_pte = xen_alloc_pte; 1024 pv_mmu_ops.alloc_pte = xen_alloc_pte;
864 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1025 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
865 pv_mmu_ops.release_pte = xen_release_pte; 1026 pv_mmu_ops.release_pte = xen_release_pte;
866 pv_mmu_ops.release_pmd = xen_release_pmd; 1027 pv_mmu_ops.release_pmd = xen_release_pmd;
867 pv_mmu_ops.set_pte = xen_set_pte; 1028#if PAGETABLE_LEVELS == 4
868 1029 pv_mmu_ops.alloc_pud = xen_alloc_pud;
869 setup_shared_info(); 1030 pv_mmu_ops.release_pud = xen_release_pud;
1031#endif
870 1032
871 /* Actually pin the pagetable down, but we can't set PG_pinned 1033#ifdef CONFIG_X86_64
872 yet because the page structures don't exist yet. */ 1034 SetPagePinned(virt_to_page(level3_user_vsyscall));
873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); 1035#endif
1036 xen_mark_init_mm_pinned();
874} 1037}
875 1038
876/* This is called once we have the cpu_possible_map */ 1039/* This is called once we have the cpu_possible_map */
877void __init xen_setup_vcpu_info_placement(void) 1040void xen_setup_vcpu_info_placement(void)
878{ 1041{
879 int cpu; 1042 int cpu;
880 1043
@@ -883,6 +1046,7 @@ void __init xen_setup_vcpu_info_placement(void)
883 1046
884 /* xen_vcpu_setup managed to place the vcpu_info within the 1047 /* xen_vcpu_setup managed to place the vcpu_info within the
885 percpu area for all cpus, so make use of it */ 1048 percpu area for all cpus, so make use of it */
1049#ifdef CONFIG_X86_32
886 if (have_vcpu_info_placement) { 1050 if (have_vcpu_info_placement) {
887 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 1051 printk(KERN_INFO "Xen: using vcpu_info placement\n");
888 1052
@@ -892,6 +1056,7 @@ void __init xen_setup_vcpu_info_placement(void)
892 pv_irq_ops.irq_enable = xen_irq_enable_direct; 1056 pv_irq_ops.irq_enable = xen_irq_enable_direct;
893 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1057 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
894 } 1058 }
1059#endif
895} 1060}
896 1061
897static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1062static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -912,10 +1077,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
912 goto patch_site 1077 goto patch_site
913 1078
914 switch (type) { 1079 switch (type) {
1080#ifdef CONFIG_X86_32
915 SITE(pv_irq_ops, irq_enable); 1081 SITE(pv_irq_ops, irq_enable);
916 SITE(pv_irq_ops, irq_disable); 1082 SITE(pv_irq_ops, irq_disable);
917 SITE(pv_irq_ops, save_fl); 1083 SITE(pv_irq_ops, save_fl);
918 SITE(pv_irq_ops, restore_fl); 1084 SITE(pv_irq_ops, restore_fl);
1085#endif /* CONFIG_X86_32 */
919#undef SITE 1086#undef SITE
920 1087
921 patch_site: 1088 patch_site:
@@ -947,6 +1114,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
947 return ret; 1114 return ret;
948} 1115}
949 1116
1117static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1118{
1119 pte_t pte;
1120
1121 phys >>= PAGE_SHIFT;
1122
1123 switch (idx) {
1124 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1125#ifdef CONFIG_X86_F00F_BUG
1126 case FIX_F00F_IDT:
1127#endif
1128#ifdef CONFIG_X86_32
1129 case FIX_WP_TEST:
1130 case FIX_VDSO:
1131# ifdef CONFIG_HIGHMEM
1132 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1133# endif
1134#else
1135 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1136#endif
1137#ifdef CONFIG_X86_LOCAL_APIC
1138 case FIX_APIC_BASE: /* maps dummy local APIC */
1139#endif
1140 pte = pfn_pte(phys, prot);
1141 break;
1142
1143 default:
1144 pte = mfn_pte(phys, prot);
1145 break;
1146 }
1147
1148 __native_set_fixmap(idx, pte);
1149
1150#ifdef CONFIG_X86_64
1151 /* Replicate changes to map the vsyscall page into the user
1152 pagetable vsyscall mapping. */
1153 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1154 unsigned long vaddr = __fix_to_virt(idx);
1155 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1156 }
1157#endif
1158}
1159
950static const struct pv_info xen_info __initdata = { 1160static const struct pv_info xen_info __initdata = {
951 .paravirt_enabled = 1, 1161 .paravirt_enabled = 1,
952 .shared_kernel_pmd = 0, 1162 .shared_kernel_pmd = 0,
@@ -960,7 +1170,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
960 .banner = xen_banner, 1170 .banner = xen_banner,
961 .memory_setup = xen_memory_setup, 1171 .memory_setup = xen_memory_setup,
962 .arch_setup = xen_arch_setup, 1172 .arch_setup = xen_arch_setup,
963 .post_allocator_init = xen_mark_init_mm_pinned, 1173 .post_allocator_init = xen_post_allocator_init,
964}; 1174};
965 1175
966static const struct pv_time_ops xen_time_ops __initdata = { 1176static const struct pv_time_ops xen_time_ops __initdata = {
@@ -968,7 +1178,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
968 1178
969 .set_wallclock = xen_set_wallclock, 1179 .set_wallclock = xen_set_wallclock,
970 .get_wallclock = xen_get_wallclock, 1180 .get_wallclock = xen_get_wallclock,
971 .get_cpu_khz = xen_cpu_khz, 1181 .get_tsc_khz = xen_tsc_khz,
972 .sched_clock = xen_sched_clock, 1182 .sched_clock = xen_sched_clock,
973}; 1183};
974 1184
@@ -978,10 +1188,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
978 .set_debugreg = xen_set_debugreg, 1188 .set_debugreg = xen_set_debugreg,
979 .get_debugreg = xen_get_debugreg, 1189 .get_debugreg = xen_get_debugreg,
980 1190
981 .clts = native_clts, 1191 .clts = xen_clts,
982 1192
983 .read_cr0 = native_read_cr0, 1193 .read_cr0 = native_read_cr0,
984 .write_cr0 = native_write_cr0, 1194 .write_cr0 = xen_write_cr0,
985 1195
986 .read_cr4 = native_read_cr4, 1196 .read_cr4 = native_read_cr4,
987 .read_cr4_safe = native_read_cr4_safe, 1197 .read_cr4_safe = native_read_cr4_safe,
@@ -990,18 +1200,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
990 .wbinvd = native_wbinvd, 1200 .wbinvd = native_wbinvd,
991 1201
992 .read_msr = native_read_msr_safe, 1202 .read_msr = native_read_msr_safe,
993 .write_msr = native_write_msr_safe, 1203 .write_msr = xen_write_msr_safe,
994 .read_tsc = native_read_tsc, 1204 .read_tsc = native_read_tsc,
995 .read_pmc = native_read_pmc, 1205 .read_pmc = native_read_pmc,
996 1206
997 .iret = xen_iret, 1207 .iret = xen_iret,
998 .irq_enable_syscall_ret = xen_sysexit, 1208 .irq_enable_sysexit = xen_sysexit,
1209#ifdef CONFIG_X86_64
1210 .usergs_sysret32 = xen_sysret32,
1211 .usergs_sysret64 = xen_sysret64,
1212#endif
999 1213
1000 .load_tr_desc = paravirt_nop, 1214 .load_tr_desc = paravirt_nop,
1001 .set_ldt = xen_set_ldt, 1215 .set_ldt = xen_set_ldt,
1002 .load_gdt = xen_load_gdt, 1216 .load_gdt = xen_load_gdt,
1003 .load_idt = xen_load_idt, 1217 .load_idt = xen_load_idt,
1004 .load_tls = xen_load_tls, 1218 .load_tls = xen_load_tls,
1219#ifdef CONFIG_X86_64
1220 .load_gs_index = xen_load_gs_index,
1221#endif
1005 1222
1006 .store_gdt = native_store_gdt, 1223 .store_gdt = native_store_gdt,
1007 .store_idt = native_store_idt, 1224 .store_idt = native_store_idt,
@@ -1015,26 +1232,48 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1015 .set_iopl_mask = xen_set_iopl_mask, 1232 .set_iopl_mask = xen_set_iopl_mask,
1016 .io_delay = xen_io_delay, 1233 .io_delay = xen_io_delay,
1017 1234
1235 /* Xen takes care of %gs when switching to usermode for us */
1236 .swapgs = paravirt_nop,
1237
1018 .lazy_mode = { 1238 .lazy_mode = {
1019 .enter = paravirt_enter_lazy_cpu, 1239 .enter = paravirt_enter_lazy_cpu,
1020 .leave = xen_leave_lazy, 1240 .leave = xen_leave_lazy,
1021 }, 1241 },
1022}; 1242};
1023 1243
1244static void __init __xen_init_IRQ(void)
1245{
1246#ifdef CONFIG_X86_64
1247 int i;
1248
1249 /* Create identity vector->irq map */
1250 for(i = 0; i < NR_VECTORS; i++) {
1251 int cpu;
1252
1253 for_each_possible_cpu(cpu)
1254 per_cpu(vector_irq, cpu)[i] = i;
1255 }
1256#endif /* CONFIG_X86_64 */
1257
1258 xen_init_IRQ();
1259}
1260
1024static const struct pv_irq_ops xen_irq_ops __initdata = { 1261static const struct pv_irq_ops xen_irq_ops __initdata = {
1025 .init_IRQ = xen_init_IRQ, 1262 .init_IRQ = __xen_init_IRQ,
1026 .save_fl = xen_save_fl, 1263 .save_fl = xen_save_fl,
1027 .restore_fl = xen_restore_fl, 1264 .restore_fl = xen_restore_fl,
1028 .irq_disable = xen_irq_disable, 1265 .irq_disable = xen_irq_disable,
1029 .irq_enable = xen_irq_enable, 1266 .irq_enable = xen_irq_enable,
1030 .safe_halt = xen_safe_halt, 1267 .safe_halt = xen_safe_halt,
1031 .halt = xen_halt, 1268 .halt = xen_halt,
1269#ifdef CONFIG_X86_64
1270 .adjust_exception_frame = xen_adjust_exception_frame,
1271#endif
1032}; 1272};
1033 1273
1034static const struct pv_apic_ops xen_apic_ops __initdata = { 1274static const struct pv_apic_ops xen_apic_ops __initdata = {
1035#ifdef CONFIG_X86_LOCAL_APIC 1275#ifdef CONFIG_X86_LOCAL_APIC
1036 .apic_write = xen_apic_write, 1276 .apic_write = xen_apic_write,
1037 .apic_write_atomic = xen_apic_write,
1038 .apic_read = xen_apic_read, 1277 .apic_read = xen_apic_read,
1039 .setup_boot_clock = paravirt_nop, 1278 .setup_boot_clock = paravirt_nop,
1040 .setup_secondary_clock = paravirt_nop, 1279 .setup_secondary_clock = paravirt_nop,
@@ -1060,6 +1299,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1060 .pte_update = paravirt_nop, 1299 .pte_update = paravirt_nop,
1061 .pte_update_defer = paravirt_nop, 1300 .pte_update_defer = paravirt_nop,
1062 1301
1302 .pgd_alloc = xen_pgd_alloc,
1303 .pgd_free = xen_pgd_free,
1304
1063 .alloc_pte = xen_alloc_pte_init, 1305 .alloc_pte = xen_alloc_pte_init,
1064 .release_pte = xen_release_pte_init, 1306 .release_pte = xen_release_pte_init,
1065 .alloc_pmd = xen_alloc_pte_init, 1307 .alloc_pmd = xen_alloc_pte_init,
@@ -1070,25 +1312,44 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1070 .kmap_atomic_pte = xen_kmap_atomic_pte, 1312 .kmap_atomic_pte = xen_kmap_atomic_pte,
1071#endif 1313#endif
1072 1314
1073 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1315#ifdef CONFIG_X86_64
1316 .set_pte = xen_set_pte,
1317#else
1318 .set_pte = xen_set_pte_init,
1319#endif
1074 .set_pte_at = xen_set_pte_at, 1320 .set_pte_at = xen_set_pte_at,
1075 .set_pmd = xen_set_pmd, 1321 .set_pmd = xen_set_pmd_hyper,
1322
1323 .ptep_modify_prot_start = __ptep_modify_prot_start,
1324 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1076 1325
1077 .pte_val = xen_pte_val, 1326 .pte_val = xen_pte_val,
1327 .pte_flags = native_pte_val,
1078 .pgd_val = xen_pgd_val, 1328 .pgd_val = xen_pgd_val,
1079 1329
1080 .make_pte = xen_make_pte, 1330 .make_pte = xen_make_pte,
1081 .make_pgd = xen_make_pgd, 1331 .make_pgd = xen_make_pgd,
1082 1332
1333#ifdef CONFIG_X86_PAE
1083 .set_pte_atomic = xen_set_pte_atomic, 1334 .set_pte_atomic = xen_set_pte_atomic,
1084 .set_pte_present = xen_set_pte_at, 1335 .set_pte_present = xen_set_pte_at,
1085 .set_pud = xen_set_pud,
1086 .pte_clear = xen_pte_clear, 1336 .pte_clear = xen_pte_clear,
1087 .pmd_clear = xen_pmd_clear, 1337 .pmd_clear = xen_pmd_clear,
1338#endif /* CONFIG_X86_PAE */
1339 .set_pud = xen_set_pud_hyper,
1088 1340
1089 .make_pmd = xen_make_pmd, 1341 .make_pmd = xen_make_pmd,
1090 .pmd_val = xen_pmd_val, 1342 .pmd_val = xen_pmd_val,
1091 1343
1344#if PAGETABLE_LEVELS == 4
1345 .pud_val = xen_pud_val,
1346 .make_pud = xen_make_pud,
1347 .set_pgd = xen_set_pgd_hyper,
1348
1349 .alloc_pud = xen_alloc_pte_init,
1350 .release_pud = xen_release_pte_init,
1351#endif /* PAGETABLE_LEVELS == 4 */
1352
1092 .activate_mm = xen_activate_mm, 1353 .activate_mm = xen_activate_mm,
1093 .dup_mmap = xen_dup_mmap, 1354 .dup_mmap = xen_dup_mmap,
1094 .exit_mmap = xen_exit_mmap, 1355 .exit_mmap = xen_exit_mmap,
@@ -1097,28 +1358,19 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1097 .enter = paravirt_enter_lazy_mmu, 1358 .enter = paravirt_enter_lazy_mmu,
1098 .leave = xen_leave_lazy, 1359 .leave = xen_leave_lazy,
1099 }, 1360 },
1100};
1101 1361
1102#ifdef CONFIG_SMP 1362 .set_fixmap = xen_set_fixmap,
1103static const struct smp_ops xen_smp_ops __initdata = {
1104 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1105 .smp_prepare_cpus = xen_smp_prepare_cpus,
1106 .cpu_up = xen_cpu_up,
1107 .smp_cpus_done = xen_smp_cpus_done,
1108
1109 .smp_send_stop = xen_smp_send_stop,
1110 .smp_send_reschedule = xen_smp_send_reschedule,
1111 .smp_call_function_mask = xen_smp_call_function_mask,
1112}; 1363};
1113#endif /* CONFIG_SMP */
1114 1364
1115static void xen_reboot(int reason) 1365static void xen_reboot(int reason)
1116{ 1366{
1367 struct sched_shutdown r = { .reason = reason };
1368
1117#ifdef CONFIG_SMP 1369#ifdef CONFIG_SMP
1118 smp_send_stop(); 1370 smp_send_stop();
1119#endif 1371#endif
1120 1372
1121 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) 1373 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1122 BUG(); 1374 BUG();
1123} 1375}
1124 1376
@@ -1154,6 +1406,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
1154 1406
1155static void __init xen_reserve_top(void) 1407static void __init xen_reserve_top(void)
1156{ 1408{
1409#ifdef CONFIG_X86_32
1157 unsigned long top = HYPERVISOR_VIRT_START; 1410 unsigned long top = HYPERVISOR_VIRT_START;
1158 struct xen_platform_parameters pp; 1411 struct xen_platform_parameters pp;
1159 1412
@@ -1161,8 +1414,248 @@ static void __init xen_reserve_top(void)
1161 top = pp.virt_start; 1414 top = pp.virt_start;
1162 1415
1163 reserve_top_address(-top + 2 * PAGE_SIZE); 1416 reserve_top_address(-top + 2 * PAGE_SIZE);
1417#endif /* CONFIG_X86_32 */
1418}
1419
1420/*
1421 * Like __va(), but returns address in the kernel mapping (which is
1422 * all we have until the physical memory mapping has been set up.
1423 */
1424static void *__ka(phys_addr_t paddr)
1425{
1426#ifdef CONFIG_X86_64
1427 return (void *)(paddr + __START_KERNEL_map);
1428#else
1429 return __va(paddr);
1430#endif
1431}
1432
1433/* Convert a machine address to physical address */
1434static unsigned long m2p(phys_addr_t maddr)
1435{
1436 phys_addr_t paddr;
1437
1438 maddr &= PTE_PFN_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440
1441 return paddr;
1164} 1442}
1165 1443
1444/* Convert a machine address to kernel virtual */
1445static void *m2v(phys_addr_t maddr)
1446{
1447 return __ka(m2p(maddr));
1448}
1449
1450#ifdef CONFIG_X86_64
1451static void walk(pgd_t *pgd, unsigned long addr)
1452{
1453 unsigned l4idx = pgd_index(addr);
1454 unsigned l3idx = pud_index(addr);
1455 unsigned l2idx = pmd_index(addr);
1456 unsigned l1idx = pte_index(addr);
1457 pgd_t l4;
1458 pud_t l3;
1459 pmd_t l2;
1460 pte_t l1;
1461
1462 xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
1463 pgd, addr, l4idx, l3idx, l2idx, l1idx);
1464
1465 l4 = pgd[l4idx];
1466 xen_raw_printk(" l4: %016lx\n", l4.pgd);
1467 xen_raw_printk(" %016lx\n", pgd_val(l4));
1468
1469 l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
1470 xen_raw_printk(" l3: %016lx\n", l3.pud);
1471 xen_raw_printk(" %016lx\n", pud_val(l3));
1472
1473 l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
1474 xen_raw_printk(" l2: %016lx\n", l2.pmd);
1475 xen_raw_printk(" %016lx\n", pmd_val(l2));
1476
1477 l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
1478 xen_raw_printk(" l1: %016lx\n", l1.pte);
1479 xen_raw_printk(" %016lx\n", pte_val(l1));
1480}
1481#endif
1482
1483static void set_page_prot(void *addr, pgprot_t prot)
1484{
1485 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1486 pte_t pte = pfn_pte(pfn, prot);
1487
1488 xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
1489 addr, pfn, get_phys_to_machine(pfn),
1490 pgprot_val(prot), pte.pte);
1491
1492 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1493 BUG();
1494}
1495
1496static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1497{
1498 unsigned pmdidx, pteidx;
1499 unsigned ident_pte;
1500 unsigned long pfn;
1501
1502 ident_pte = 0;
1503 pfn = 0;
1504 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1505 pte_t *pte_page;
1506
1507 /* Reuse or allocate a page of ptes */
1508 if (pmd_present(pmd[pmdidx]))
1509 pte_page = m2v(pmd[pmdidx].pmd);
1510 else {
1511 /* Check for free pte pages */
1512 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1513 break;
1514
1515 pte_page = &level1_ident_pgt[ident_pte];
1516 ident_pte += PTRS_PER_PTE;
1517
1518 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1519 }
1520
1521 /* Install mappings */
1522 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1523 pte_t pte;
1524
1525 if (pfn > max_pfn_mapped)
1526 max_pfn_mapped = pfn;
1527
1528 if (!pte_none(pte_page[pteidx]))
1529 continue;
1530
1531 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1532 pte_page[pteidx] = pte;
1533 }
1534 }
1535
1536 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1537 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1538
1539 set_page_prot(pmd, PAGE_KERNEL_RO);
1540}
1541
1542#ifdef CONFIG_X86_64
1543static void convert_pfn_mfn(void *v)
1544{
1545 pte_t *pte = v;
1546 int i;
1547
1548 /* All levels are converted the same way, so just treat them
1549 as ptes. */
1550 for(i = 0; i < PTRS_PER_PTE; i++)
1551 pte[i] = xen_make_pte(pte[i].pte);
1552}
1553
1554/*
1555 * Set up the inital kernel pagetable.
1556 *
1557 * We can construct this by grafting the Xen provided pagetable into
1558 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1559 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1560 * means that only the kernel has a physical mapping to start with -
1561 * but that's enough to get __va working. We need to fill in the rest
1562 * of the physical mapping once some sort of allocator has been set
1563 * up.
1564 */
1565static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1566{
1567 pud_t *l3;
1568 pmd_t *l2;
1569
1570 /* Zap identity mapping */
1571 init_level4_pgt[0] = __pgd(0);
1572
1573 /* Pre-constructed entries are in pfn, so convert to mfn */
1574 convert_pfn_mfn(init_level4_pgt);
1575 convert_pfn_mfn(level3_ident_pgt);
1576 convert_pfn_mfn(level3_kernel_pgt);
1577
1578 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1579 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1580
1581 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1582 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1583
1584 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1585 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1586 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1587
1588 /* Set up identity map */
1589 xen_map_identity_early(level2_ident_pgt, max_pfn);
1590
1591 /* Make pagetable pieces RO */
1592 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1594 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1595 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1598
1599 /* Pin down new L4 */
1600 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1601 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1602
1603 /* Unpin Xen-provided one */
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 /* Switch over */
1607 pgd = init_level4_pgt;
1608
1609 /*
1610 * At this stage there can be no user pgd, and no page
1611 * structure to attach it to, so make sure we just set kernel
1612 * pgd.
1613 */
1614 xen_mc_batch();
1615 __xen_write_cr3(true, __pa(pgd));
1616 xen_mc_issue(PARAVIRT_LAZY_CPU);
1617
1618 reserve_early(__pa(xen_start_info->pt_base),
1619 __pa(xen_start_info->pt_base +
1620 xen_start_info->nr_pt_frames * PAGE_SIZE),
1621 "XEN PAGETABLES");
1622
1623 return pgd;
1624}
1625#else /* !CONFIG_X86_64 */
1626static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1627
1628static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1629{
1630 pmd_t *kernel_pmd;
1631
1632 init_pg_tables_start = __pa(pgd);
1633 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1634 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1635
1636 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1637 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1638
1639 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1640
1641 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1642 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1643 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1644
1645 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1646 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1647 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1648
1649 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1650
1651 xen_write_cr3(__pa(swapper_pg_dir));
1652
1653 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1654
1655 return swapper_pg_dir;
1656}
1657#endif /* CONFIG_X86_64 */
1658
1166/* First C function to be called on Xen boot */ 1659/* First C function to be called on Xen boot */
1167asmlinkage void __init xen_start_kernel(void) 1660asmlinkage void __init xen_start_kernel(void)
1168{ 1661{
@@ -1173,6 +1666,8 @@ asmlinkage void __init xen_start_kernel(void)
1173 1666
1174 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1667 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1175 1668
1669 xen_setup_features();
1670
1176 /* Install Xen paravirt ops */ 1671 /* Install Xen paravirt ops */
1177 pv_info = xen_info; 1672 pv_info = xen_info;
1178 pv_init_ops = xen_init_ops; 1673 pv_init_ops = xen_init_ops;
@@ -1182,59 +1677,85 @@ asmlinkage void __init xen_start_kernel(void)
1182 pv_apic_ops = xen_apic_ops; 1677 pv_apic_ops = xen_apic_ops;
1183 pv_mmu_ops = xen_mmu_ops; 1678 pv_mmu_ops = xen_mmu_ops;
1184 1679
1680 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1681 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1682 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1683 }
1684
1185 machine_ops = xen_machine_ops; 1685 machine_ops = xen_machine_ops;
1186 1686
1187#ifdef CONFIG_SMP 1687#ifdef CONFIG_X86_64
1188 smp_ops = xen_smp_ops; 1688 /* Disable until direct per-cpu data access. */
1689 have_vcpu_info_placement = 0;
1690 x86_64_init_pda();
1189#endif 1691#endif
1190 1692
1191 xen_setup_features(); 1693 xen_smp_init();
1192 1694
1193 /* Get mfn list */ 1695 /* Get mfn list */
1194 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1696 if (!xen_feature(XENFEAT_auto_translated_physmap))
1195 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; 1697 xen_build_dynamic_phys_to_machine();
1196 1698
1197 pgd = (pgd_t *)xen_start_info->pt_base; 1699 pgd = (pgd_t *)xen_start_info->pt_base;
1198 1700
1199 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1701 /* Prevent unwanted bits from being set in PTEs. */
1200 1702 __supported_pte_mask &= ~_PAGE_GLOBAL;
1201 init_mm.pgd = pgd; /* use the Xen pagetables to start */ 1703 if (!is_initial_xendomain())
1202 1704 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1203 /* keep using Xen gdt for now; no urgent need to change it */
1204
1205 x86_write_percpu(xen_cr3, __pa(pgd));
1206 x86_write_percpu(xen_current_cr3, __pa(pgd));
1207 1705
1208 /* Don't do the full vcpu_info placement stuff until we have a 1706 /* Don't do the full vcpu_info placement stuff until we have a
1209 possible map and a non-dummy shared_info. */ 1707 possible map and a non-dummy shared_info. */
1210 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1708 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1211 1709
1710 xen_raw_console_write("mapping kernel into physical memory\n");
1711 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1712
1713 init_mm.pgd = pgd;
1714
1715 /* keep using Xen gdt for now; no urgent need to change it */
1716
1212 pv_info.kernel_rpl = 1; 1717 pv_info.kernel_rpl = 1;
1213 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1718 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1214 pv_info.kernel_rpl = 0; 1719 pv_info.kernel_rpl = 0;
1215 1720
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1221 /* set the limit of our address space */ 1721 /* set the limit of our address space */
1222 xen_reserve_top(); 1722 xen_reserve_top();
1223 1723
1724#ifdef CONFIG_X86_32
1224 /* set up basic CPUID stuff */ 1725 /* set up basic CPUID stuff */
1225 cpu_detect(&new_cpu_data); 1726 cpu_detect(&new_cpu_data);
1226 new_cpu_data.hard_math = 1; 1727 new_cpu_data.hard_math = 1;
1227 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1728 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1729#endif
1228 1730
1229 /* Poke various useful things into boot_params */ 1731 /* Poke various useful things into boot_params */
1230 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1732 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1231 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1733 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1232 ? __pa(xen_start_info->mod_start) : 0; 1734 ? __pa(xen_start_info->mod_start) : 0;
1233 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1735 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1736 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1234 1737
1235 if (!is_initial_xendomain()) 1738 if (!is_initial_xendomain()) {
1739 add_preferred_console("xenboot", 0, NULL);
1740 add_preferred_console("tty", 0, NULL);
1236 add_preferred_console("hvc", 0, NULL); 1741 add_preferred_console("hvc", 0, NULL);
1742 }
1743
1744 xen_raw_console_write("about to get started...\n");
1745
1746#if 0
1747 xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
1748 &boot_params, __pa_symbol(&boot_params),
1749 __va(__pa_symbol(&boot_params)));
1750
1751 walk(pgd, &boot_params);
1752 walk(pgd, __va(__pa(&boot_params)));
1753#endif
1237 1754
1238 /* Start the world */ 1755 /* Start the world */
1239 start_kernel(); 1756#ifdef CONFIG_X86_32
1757 i386_start_kernel();
1758#else
1759 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1760#endif
1240} 1761}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 4e527e7893a8..aa37469da696 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
44 44
45#include <asm/pgtable.h> 45#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
47#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 48#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/linkage.h>
49 51
50#include <asm/xen/hypercall.h> 52#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
@@ -56,15 +58,144 @@
56#include "multicalls.h" 58#include "multicalls.h"
57#include "mmu.h" 59#include "mmu.h"
58 60
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 61/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a
63 * redzone above it, so round it up to a PGD boundary.
64 */
65#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
66
67
68#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
69#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
70
71/* Placeholder for holes in the address space */
72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
73 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
74
75 /* Array of pointers to pages containing p2m entries */
76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
77 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
78
79/* Arrays of p2m arrays expressed in mfns used for save/restore */
80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
81
82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
83 __page_aligned_bss;
84
85static inline unsigned p2m_top_index(unsigned long pfn)
86{
87 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
88 return pfn / P2M_ENTRIES_PER_PAGE;
89}
90
91static inline unsigned p2m_index(unsigned long pfn)
92{
93 return pfn % P2M_ENTRIES_PER_PAGE;
94}
95
96/* Build the parallel p2m_top_mfn structures */
97void xen_setup_mfn_list_list(void)
98{
99 unsigned pfn, idx;
100
101 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
102 unsigned topidx = p2m_top_index(pfn);
103
104 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
105 }
106
107 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
108 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
109 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
110 }
111
112 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
113
114 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
115 virt_to_mfn(p2m_top_mfn_list);
116 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
117}
118
119/* Set up p2m_top to point to the domain-builder provided p2m pages */
120void __init xen_build_dynamic_phys_to_machine(void)
121{
122 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
123 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
124 unsigned pfn;
125
126 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
127 unsigned topidx = p2m_top_index(pfn);
128
129 p2m_top[topidx] = &mfn_list[pfn];
130 }
131}
132
133unsigned long get_phys_to_machine(unsigned long pfn)
134{
135 unsigned topidx, idx;
136
137 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
138 return INVALID_P2M_ENTRY;
139
140 topidx = p2m_top_index(pfn);
141 idx = p2m_index(pfn);
142 return p2m_top[topidx][idx];
143}
144EXPORT_SYMBOL_GPL(get_phys_to_machine);
145
146static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
147{
148 unsigned long *p;
149 unsigned i;
150
151 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
152 BUG_ON(p == NULL);
153
154 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
155 p[i] = INVALID_P2M_ENTRY;
156
157 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
158 free_page((unsigned long)p);
159 else
160 *mfnp = virt_to_mfn(p);
161}
162
163void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
164{
165 unsigned topidx, idx;
166
167 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
168 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
169 return;
170 }
171
172 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
173 BUG_ON(mfn != INVALID_P2M_ENTRY);
174 return;
175 }
176
177 topidx = p2m_top_index(pfn);
178 if (p2m_top[topidx] == p2m_missing) {
179 /* no need to allocate a page to store an invalid entry */
180 if (mfn == INVALID_P2M_ENTRY)
181 return;
182 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
183 }
184
185 idx = p2m_index(pfn);
186 p2m_top[topidx][idx] = mfn;
187}
188
189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
60{ 190{
191 unsigned long address = (unsigned long)vaddr;
61 unsigned int level; 192 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 193 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & ~PAGE_MASK; 194 unsigned offset = address & ~PAGE_MASK;
64 195
65 BUG_ON(pte == NULL); 196 BUG_ON(pte == NULL);
66 197
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 198 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
68} 199}
69 200
70void make_lowmem_page_readonly(void *vaddr) 201void make_lowmem_page_readonly(void *vaddr)
@@ -98,59 +229,68 @@ void make_lowmem_page_readwrite(void *vaddr)
98} 229}
99 230
100 231
101void xen_set_pmd(pmd_t *ptr, pmd_t val) 232static bool page_pinned(void *ptr)
233{
234 struct page *page = virt_to_page(ptr);
235
236 return PagePinned(page);
237}
238
239static void extend_mmu_update(const struct mmu_update *update)
102{ 240{
103 struct multicall_space mcs; 241 struct multicall_space mcs;
104 struct mmu_update *u; 242 struct mmu_update *u;
105 243
106 preempt_disable(); 244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245
246 if (mcs.mc != NULL)
247 mcs.mc->args[1]++;
248 else {
249 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
251 }
107 252
108 mcs = xen_mc_entry(sizeof(*u));
109 u = mcs.args; 253 u = mcs.args;
110 u->ptr = virt_to_machine(ptr).maddr; 254 *u = *update;
111 u->val = pmd_val_ma(val); 255}
112 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 256
257void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
258{
259 struct mmu_update u;
260
261 preempt_disable();
262
263 xen_mc_batch();
264
265 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u);
113 269
114 xen_mc_issue(PARAVIRT_LAZY_MMU); 270 xen_mc_issue(PARAVIRT_LAZY_MMU);
115 271
116 preempt_enable(); 272 preempt_enable();
117} 273}
118 274
275void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{
277 /* If page is not pinned, we can just update the entry
278 directly */
279 if (!page_pinned(ptr)) {
280 *ptr = val;
281 return;
282 }
283
284 xen_set_pmd_hyper(ptr, val);
285}
286
119/* 287/*
120 * Associate a virtual page frame with a given physical page frame 288 * Associate a virtual page frame with a given physical page frame
121 * and protection flags for that frame. 289 * and protection flags for that frame.
122 */ 290 */
123void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124{ 292{
125 pgd_t *pgd; 293 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
126 pud_t *pud;
127 pmd_t *pmd;
128 pte_t *pte;
129
130 pgd = swapper_pg_dir + pgd_index(vaddr);
131 if (pgd_none(*pgd)) {
132 BUG();
133 return;
134 }
135 pud = pud_offset(pgd, vaddr);
136 if (pud_none(*pud)) {
137 BUG();
138 return;
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 BUG();
143 return;
144 }
145 pte = pte_offset_kernel(pmd, vaddr);
146 /* <mfn,flags> stored as-is, to permit clearing entries */
147 xen_set_pte(pte, mfn_pte(mfn, flags));
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154} 294}
155 295
156void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -179,12 +319,32 @@ out:
179 preempt_enable(); 319 preempt_enable();
180} 320}
181 321
322pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
323{
324 /* Just return the pte as-is. We preserve the bits on commit */
325 return *ptep;
326}
327
328void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
329 pte_t *ptep, pte_t pte)
330{
331 struct mmu_update u;
332
333 xen_mc_batch();
334
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u);
338
339 xen_mc_issue(PARAVIRT_LAZY_MMU);
340}
341
182/* Assume pteval_t is equivalent to all the other *val_t types. */ 342/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
184{ 344{
185 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & PTE_FLAGS_MASK;
188 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 } 349 }
190 350
@@ -194,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
194static pteval_t pte_pfn_to_mfn(pteval_t val) 354static pteval_t pte_pfn_to_mfn(pteval_t val)
195{ 355{
196 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & PTE_FLAGS_MASK;
199 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 } 360 }
201 361
@@ -229,34 +389,51 @@ pmdval_t xen_pmd_val(pmd_t pmd)
229 return pte_mfn_to_pfn(pmd.pmd); 389 return pte_mfn_to_pfn(pmd.pmd);
230} 390}
231 391
232void xen_set_pud(pud_t *ptr, pud_t val) 392void xen_set_pud_hyper(pud_t *ptr, pud_t val)
233{ 393{
234 struct multicall_space mcs; 394 struct mmu_update u;
235 struct mmu_update *u;
236 395
237 preempt_disable(); 396 preempt_disable();
238 397
239 mcs = xen_mc_entry(sizeof(*u)); 398 xen_mc_batch();
240 u = mcs.args; 399
241 u->ptr = virt_to_machine(ptr).maddr; 400 /* ptr may be ioremapped for 64-bit pagetable setup */
242 u->val = pud_val_ma(val); 401 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
243 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); 402 u.val = pud_val_ma(val);
403 extend_mmu_update(&u);
244 404
245 xen_mc_issue(PARAVIRT_LAZY_MMU); 405 xen_mc_issue(PARAVIRT_LAZY_MMU);
246 406
247 preempt_enable(); 407 preempt_enable();
248} 408}
249 409
410void xen_set_pud(pud_t *ptr, pud_t val)
411{
412 /* If page is not pinned, we can just update the entry
413 directly */
414 if (!page_pinned(ptr)) {
415 *ptr = val;
416 return;
417 }
418
419 xen_set_pud_hyper(ptr, val);
420}
421
250void xen_set_pte(pte_t *ptep, pte_t pte) 422void xen_set_pte(pte_t *ptep, pte_t pte)
251{ 423{
424#ifdef CONFIG_X86_PAE
252 ptep->pte_high = pte.pte_high; 425 ptep->pte_high = pte.pte_high;
253 smp_wmb(); 426 smp_wmb();
254 ptep->pte_low = pte.pte_low; 427 ptep->pte_low = pte.pte_low;
428#else
429 *ptep = pte;
430#endif
255} 431}
256 432
433#ifdef CONFIG_X86_PAE
257void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
258{ 435{
259 set_64bit((u64 *)ptep, pte_val_ma(pte)); 436 set_64bit((u64 *)ptep, native_pte_val(pte));
260} 437}
261 438
262void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -268,8 +445,9 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
268 445
269void xen_pmd_clear(pmd_t *pmdp) 446void xen_pmd_clear(pmd_t *pmdp)
270{ 447{
271 xen_set_pmd(pmdp, __pmd(0)); 448 set_pmd(pmdp, __pmd(0));
272} 449}
450#endif /* CONFIG_X86_PAE */
273 451
274pmd_t xen_make_pmd(pmdval_t pmd) 452pmd_t xen_make_pmd(pmdval_t pmd)
275{ 453{
@@ -277,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
277 return native_make_pmd(pmd); 455 return native_make_pmd(pmd);
278} 456}
279 457
458#if PAGETABLE_LEVELS == 4
459pudval_t xen_pud_val(pud_t pud)
460{
461 return pte_mfn_to_pfn(pud.pud);
462}
463
464pud_t xen_make_pud(pudval_t pud)
465{
466 pud = pte_pfn_to_mfn(pud);
467
468 return native_make_pud(pud);
469}
470
471pgd_t *xen_get_user_pgd(pgd_t *pgd)
472{
473 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
474 unsigned offset = pgd - pgd_page;
475 pgd_t *user_ptr = NULL;
476
477 if (offset < pgd_index(USER_LIMIT)) {
478 struct page *page = virt_to_page(pgd_page);
479 user_ptr = (pgd_t *)page->private;
480 if (user_ptr)
481 user_ptr += offset;
482 }
483
484 return user_ptr;
485}
486
487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
488{
489 struct mmu_update u;
490
491 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u);
494}
495
280/* 496/*
281 (Yet another) pagetable walker. This one is intended for pinning a 497 * Raw hypercall-based set_pgd, intended for in early boot before
282 pagetable. This means that it walks a pagetable and calls the 498 * there's a page structure. This implies:
283 callback function on each page it finds making up the page table, 499 * 1. The only existing pagetable is the kernel's
284 at every level. It walks the entire pagetable, but it only bothers 500 * 2. It is always pinned
285 pinning pte pages which are below pte_limit. In the normal case 501 * 3. It has no user pagetable attached to it
286 this will be TASK_SIZE, but at boot we need to pin up to 502 */
287 FIXADDR_TOP. But the important bit is that we don't pin beyond 503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
288 there, because then we start getting into Xen's ptes. 504{
289*/ 505 preempt_disable();
290static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 506
507 xen_mc_batch();
508
509 __xen_set_pgd_hyper(ptr, val);
510
511 xen_mc_issue(PARAVIRT_LAZY_MMU);
512
513 preempt_enable();
514}
515
516void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519
520 /* If page is not pinned, we can just update the entry
521 directly */
522 if (!page_pinned(ptr)) {
523 *ptr = val;
524 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr));
526 *user_ptr = val;
527 }
528 return;
529 }
530
531 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */
533 xen_mc_batch();
534
535 __xen_set_pgd_hyper(ptr, val);
536 if (user_ptr)
537 __xen_set_pgd_hyper(user_ptr, val);
538
539 xen_mc_issue(PARAVIRT_LAZY_MMU);
540}
541#endif /* PAGETABLE_LEVELS == 4 */
542
543/*
544 * (Yet another) pagetable walker. This one is intended for pinning a
545 * pagetable. This means that it walks a pagetable and calls the
546 * callback function on each page it finds making up the page table,
547 * at every level. It walks the entire pagetable, but it only bothers
548 * pinning pte pages which are below limit. In the normal case this
549 * will be STACK_TOP_MAX, but at boot we need to pin up to
550 * FIXADDR_TOP.
551 *
552 * For 32-bit the important bit is that we don't pin beyond there,
553 * because then we start getting into Xen's ptes.
554 *
555 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole.
557 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
291 unsigned long limit) 559 unsigned long limit)
292{ 560{
293 pgd_t *pgd = pgd_base;
294 int flush = 0; 561 int flush = 0;
295 unsigned long addr = 0; 562 unsigned hole_low, hole_high;
296 unsigned long pgd_next; 563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
564 unsigned pgdidx, pudidx, pmdidx;
297 565
298 BUG_ON(limit > FIXADDR_TOP); 566 /* The limit is the last byte to be touched */
567 limit--;
568 BUG_ON(limit >= FIXADDR_TOP);
299 569
300 if (xen_feature(XENFEAT_auto_translated_physmap)) 570 if (xen_feature(XENFEAT_auto_translated_physmap))
301 return 0; 571 return 0;
302 572
303 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 573 /*
574 * 64-bit has a great big hole in the middle of the address
575 * space, which contains the Xen mappings. On 32-bit these
576 * will end up making a zero-sized hole and so is a no-op.
577 */
578 hole_low = pgd_index(USER_LIMIT);
579 hole_high = pgd_index(PAGE_OFFSET);
580
581 pgdidx_limit = pgd_index(limit);
582#if PTRS_PER_PUD > 1
583 pudidx_limit = pud_index(limit);
584#else
585 pudidx_limit = 0;
586#endif
587#if PTRS_PER_PMD > 1
588 pmdidx_limit = pmd_index(limit);
589#else
590 pmdidx_limit = 0;
591#endif
592
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
304 pud_t *pud; 596 pud_t *pud;
305 unsigned long pud_limit, pud_next;
306 597
307 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 598 if (pgdidx >= hole_low && pgdidx < hole_high)
599 continue;
308 600
309 if (!pgd_val(*pgd)) 601 if (!pgd_val(pgd[pgdidx]))
310 continue; 602 continue;
311 603
312 pud = pud_offset(pgd, 0); 604 pud = pud_offset(&pgd[pgdidx], 0);
313 605
314 if (PTRS_PER_PUD > 1) /* not folded */ 606 if (PTRS_PER_PUD > 1) /* not folded */
315 flush |= (*func)(virt_to_page(pud), PT_PUD); 607 flush |= (*func)(virt_to_page(pud), PT_PUD);
316 608
317 for (; addr != pud_limit; pud++, addr = pud_next) { 609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
318 pmd_t *pmd; 610 pmd_t *pmd;
319 unsigned long pmd_limit;
320 611
321 pud_next = pud_addr_end(addr, pud_limit); 612 if (pgdidx == pgdidx_limit &&
322 613 pudidx > pudidx_limit)
323 if (pud_next < limit) 614 goto out;
324 pmd_limit = pud_next;
325 else
326 pmd_limit = limit;
327 615
328 if (pud_none(*pud)) 616 if (pud_none(pud[pudidx]))
329 continue; 617 continue;
330 618
331 pmd = pmd_offset(pud, 0); 619 pmd = pmd_offset(&pud[pudidx], 0);
332 620
333 if (PTRS_PER_PMD > 1) /* not folded */ 621 if (PTRS_PER_PMD > 1) /* not folded */
334 flush |= (*func)(virt_to_page(pmd), PT_PMD); 622 flush |= (*func)(virt_to_page(pmd), PT_PMD);
335 623
336 for (; addr != pmd_limit; pmd++) { 624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
337 addr += (PAGE_SIZE * PTRS_PER_PTE); 625 struct page *pte;
338 if ((pmd_limit-1) < (addr-1)) { 626
339 addr = pmd_limit; 627 if (pgdidx == pgdidx_limit &&
340 break; 628 pudidx == pudidx_limit &&
341 } 629 pmdidx > pmdidx_limit)
630 goto out;
342 631
343 if (pmd_none(*pmd)) 632 if (pmd_none(pmd[pmdidx]))
344 continue; 633 continue;
345 634
346 flush |= (*func)(pmd_page(*pmd), PT_PTE); 635 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE);
347 } 637 }
348 } 638 }
349 } 639 }
350 640out:
351 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
352 641
353 return flush; 642 return flush;
354} 643}
@@ -430,20 +719,62 @@ void xen_pgd_pin(pgd_t *pgd)
430{ 719{
431 xen_mc_batch(); 720 xen_mc_batch();
432 721
433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
434 /* re-enable interrupts for kmap_flush_unused */ 723 /* re-enable interrupts for kmap_flush_unused */
435 xen_mc_issue(0); 724 xen_mc_issue(0);
436 kmap_flush_unused(); 725 kmap_flush_unused();
437 xen_mc_batch(); 726 xen_mc_batch();
438 } 727 }
439 728
729#ifdef CONFIG_X86_64
730 {
731 pgd_t *user_pgd = xen_get_user_pgd(pgd);
732
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734
735 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 }
739 }
740#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
744#endif
440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */
441 xen_mc_issue(0); 747 xen_mc_issue(0);
442} 748}
443 749
444/* The init_mm pagetable is really pinned as soon as its created, but 750/*
445 that's before we have page structures to store the bits. So do all 751 * On save, we need to pin all pagetables to make sure they get their
446 the book-keeping now. */ 752 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction).
755 */
756void xen_mm_pin_all(void)
757{
758 unsigned long flags;
759 struct page *page;
760
761 spin_lock_irqsave(&pgd_lock, flags);
762
763 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page));
766 SetPageSavePinned(page);
767 }
768 }
769
770 spin_unlock_irqrestore(&pgd_lock, flags);
771}
772
773/*
774 * The init_mm pagetable is really pinned as soon as its created, but
775 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now.
777 */
447static __init int mark_pinned(struct page *page, enum pt_level level) 778static __init int mark_pinned(struct page *page, enum pt_level level)
448{ 779{
449 SetPagePinned(page); 780 SetPagePinned(page);
@@ -493,11 +824,49 @@ static void xen_pgd_unpin(pgd_t *pgd)
493 824
494 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 825 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
495 826
496 pgd_walk(pgd, unpin_page, TASK_SIZE); 827#ifdef CONFIG_X86_64
828 {
829 pgd_t *user_pgd = xen_get_user_pgd(pgd);
830
831 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD);
834 }
835 }
836#endif
837
838#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
841#endif
842
843 pgd_walk(pgd, unpin_page, USER_LIMIT);
497 844
498 xen_mc_issue(0); 845 xen_mc_issue(0);
499} 846}
500 847
848/*
849 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables.
851 */
852void xen_mm_unpin_all(void)
853{
854 unsigned long flags;
855 struct page *page;
856
857 spin_lock_irqsave(&pgd_lock, flags);
858
859 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page));
863 ClearPageSavePinned(page);
864 }
865 }
866
867 spin_unlock_irqrestore(&pgd_lock, flags);
868}
869
501void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
502{ 871{
503 spin_lock(&next->page_table_lock); 872 spin_lock(&next->page_table_lock);
@@ -519,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
519static void drop_other_mm_ref(void *info) 888static void drop_other_mm_ref(void *info)
520{ 889{
521 struct mm_struct *mm = info; 890 struct mm_struct *mm = info;
891 struct mm_struct *active_mm;
892
893#ifdef CONFIG_X86_64
894 active_mm = read_pda(active_mm);
895#else
896 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
897#endif
522 898
523 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 899 if (active_mm == mm)
524 leave_mm(smp_processor_id()); 900 leave_mm(smp_processor_id());
525 901
526 /* If this cpu still has a stale cr3 reference, then make sure 902 /* If this cpu still has a stale cr3 reference, then make sure
@@ -558,7 +934,7 @@ static void drop_mm_ref(struct mm_struct *mm)
558 } 934 }
559 935
560 if (!cpus_empty(mask)) 936 if (!cpus_empty(mask))
561 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
562} 938}
563#else 939#else
564static void drop_mm_ref(struct mm_struct *mm) 940static void drop_mm_ref(struct mm_struct *mm)
@@ -591,7 +967,7 @@ void xen_exit_mmap(struct mm_struct *mm)
591 spin_lock(&mm->page_table_lock); 967 spin_lock(&mm->page_table_lock);
592 968
593 /* pgd may not be pinned in the error exit path of execve */ 969 /* pgd may not be pinned in the error exit path of execve */
594 if (PagePinned(virt_to_page(mm->pgd))) 970 if (page_pinned(mm->pgd))
595 xen_pgd_unpin(mm->pgd); 971 xen_pgd_unpin(mm->pgd);
596 972
597 spin_unlock(&mm->page_table_lock); 973 spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe961caffd4..0f59bd03f9e3 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,25 +10,9 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
28void xen_set_pte(pte_t *ptep, pte_t pteval);
29void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
30 pte_t *ptep, pte_t pteval);
31void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
32 16
33void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); 17void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
34void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
@@ -45,11 +29,32 @@ pte_t xen_make_pte(pteval_t);
45pmd_t xen_make_pmd(pmdval_t); 29pmd_t xen_make_pmd(pmdval_t);
46pgd_t xen_make_pgd(pgdval_t); 30pgd_t xen_make_pgd(pgdval_t);
47 31
32void xen_set_pte(pte_t *ptep, pte_t pteval);
48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
49 pte_t *ptep, pte_t pteval); 34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
50void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
51void xen_set_pud(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
53void xen_pmd_clear(pmd_t *pmdp); 39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
43void xen_set_pud(pud_t *ptr, pud_t val);
44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
46
47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
58 pte_t *ptep, pte_t pte);
54 59
55#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..9efd1c6c9776 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -29,14 +29,14 @@
29#define MC_DEBUG 1 29#define MC_DEBUG 1
30 30
31#define MC_BATCH 32 31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16)
33 33
34struct mc_buffer { 34struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH]; 37 struct multicall_entry debug[MC_BATCH];
38#endif 38#endif
39 u64 args[MC_ARGS]; 39 unsigned char args[MC_ARGS];
40 struct callback { 40 struct callback {
41 void (*fn)(void *); 41 void (*fn)(void *);
42 void *data; 42 void *data;
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
76 if (ret) { 76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 78 ret, smp_processor_id());
79 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 80 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 81 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 82 i+1, b->mcidx,
@@ -107,20 +108,48 @@ struct multicall_space __xen_mc_entry(size_t args)
107{ 108{
108 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 109 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
109 struct multicall_space ret; 110 struct multicall_space ret;
110 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); 111 unsigned argidx = roundup(b->argidx, sizeof(u64));
111 112
112 BUG_ON(preemptible()); 113 BUG_ON(preemptible());
113 BUG_ON(argspace > MC_ARGS); 114 BUG_ON(b->argidx > MC_ARGS);
114 115
115 if (b->mcidx == MC_BATCH || 116 if (b->mcidx == MC_BATCH ||
116 (b->argidx + argspace) > MC_ARGS) 117 (argidx + args) > MC_ARGS) {
117 xen_mc_flush(); 118 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64));
120 }
118 121
119 ret.mc = &b->entries[b->mcidx]; 122 ret.mc = &b->entries[b->mcidx];
120 b->mcidx++; 123 b->mcidx++;
124 ret.args = &b->args[argidx];
125 b->argidx = argidx + args;
126
127 BUG_ON(b->argidx > MC_ARGS);
128 return ret;
129}
130
131struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
132{
133 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
134 struct multicall_space ret = { NULL, NULL };
135
136 BUG_ON(preemptible());
137 BUG_ON(b->argidx > MC_ARGS);
138
139 if (b->mcidx == 0)
140 return ret;
141
142 if (b->entries[b->mcidx - 1].op != op)
143 return ret;
144
145 if ((b->argidx + size) > MC_ARGS)
146 return ret;
147
148 ret.mc = &b->entries[b->mcidx - 1];
121 ret.args = &b->args[b->argidx]; 149 ret.args = &b->args[b->argidx];
122 b->argidx += argspace; 150 b->argidx += size;
123 151
152 BUG_ON(b->argidx > MC_ARGS);
124 return ret; 153 return ret;
125} 154}
126 155
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
45/* Set up a callback to be called when the current batch is flushed */ 45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data); 46void xen_mc_callback(void (*fn)(void *), void *data);
47 47
48/*
49 * Try to extend the arguments of the previous multicall command. The
50 * previous command's op must match. If it does, then it attempts to
51 * extend the argument space allocated to the multicall entry by
52 * arg_size bytes.
53 *
54 * The returned multicall_space will return with mc pointing to the
55 * command on success, or NULL on failure, and args pointing to the
56 * newly allocated space.
57 */
58struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
59
48#endif /* _XEN_MULTICALLS_H */ 60#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..b6acc3a0af46 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -13,9 +13,11 @@
13#include <asm/vdso.h> 13#include <asm/vdso.h>
14#include <asm/e820.h> 14#include <asm/e820.h>
15#include <asm/setup.h> 15#include <asm/setup.h>
16#include <asm/acpi.h>
16#include <asm/xen/hypervisor.h> 17#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 18#include <asm/xen/hypercall.h>
18 19
20#include <xen/page.h>
19#include <xen/interface/callback.h> 21#include <xen/interface/callback.h>
20#include <xen/interface/physdev.h> 22#include <xen/interface/physdev.h>
21#include <xen/features.h> 23#include <xen/features.h>
@@ -27,8 +29,6 @@
27extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
28extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
29 31
30unsigned long *phys_to_machine_mapping;
31EXPORT_SYMBOL(phys_to_machine_mapping);
32 32
33/** 33/**
34 * machine_specific_memory_setup - Hook for machine specific memory setup. 34 * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,9 +38,31 @@ char * __init xen_memory_setup(void)
38{ 38{
39 unsigned long max_pfn = xen_start_info->nr_pages; 39 unsigned long max_pfn = xen_start_info->nr_pages;
40 40
41 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
42
41 e820.nr_map = 0; 43 e820.nr_map = 0;
42 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 44
43 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); 45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM);
46
47 /*
48 * Even though this is normal, usable memory under Xen, reserve
49 * ISA memory anyway because too many things think they can poke
50 * about in there.
51 */
52 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
53 E820_RESERVED);
54
55 /*
56 * Reserve Xen bits:
57 * - mfn_list
58 * - xen_start_info
59 * See comment above "struct start_info" in <xen/interface/xen.h>
60 */
61 e820_add_region(__pa(xen_start_info->mfn_list),
62 xen_start_info->pt_base - xen_start_info->mfn_list,
63 E820_RESERVED);
64
65 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
44 66
45 return "Xen"; 67 return "Xen";
46} 68}
@@ -61,30 +83,72 @@ static void xen_idle(void)
61 83
62/* 84/*
63 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
64 */ 88 */
65static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
66{ 90{
67 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
68 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
69 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
70} 98}
71 99
72void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
73{ 101{
74 int cpu = smp_processor_id(); 102 struct callback_register callback = {
75 extern void xen_sysenter_target(void); 103 .type = type,
76 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
77 static struct callback_register sysenter = {
78 .type = CALLBACKTYPE_sysenter,
79 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
80 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
81 }; 106 };
82 107
83 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
84 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
85 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
86 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
127 if(ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
87 } 150 }
151#endif /* CONFIG_X86_64 */
88} 152}
89 153
90void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -98,10 +162,12 @@ void __init xen_arch_setup(void)
98 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
99 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
100 164
101 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
102 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
103 168
104 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
105 171
106 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
107 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -121,11 +187,6 @@ void __init xen_arch_setup(void)
121 187
122 pm_idle = xen_idle; 188 pm_idle = xen_idle;
123 189
124#ifdef CONFIG_SMP
125 /* fill cpus_possible with all available cpus */
126 xen_fill_possible_map();
127#endif
128
129 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
130 191
131 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..d8faf79a0a1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
15 * This does not handle HOTPLUG_CPU yet. 15 * This does not handle HOTPLUG_CPU yet.
16 */ 16 */
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/kernel_stat.h>
18#include <linux/err.h> 19#include <linux/err.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20 21
@@ -35,28 +36,17 @@
35#include "xen-ops.h" 36#include "xen-ops.h"
36#include "mmu.h" 37#include "mmu.h"
37 38
38static cpumask_t xen_cpu_initialized_map; 39static void __cpuinit xen_init_lock_cpu(int cpu);
39static DEFINE_PER_CPU(int, resched_irq) = -1;
40static DEFINE_PER_CPU(int, callfunc_irq) = -1;
41static DEFINE_PER_CPU(int, debug_irq) = -1;
42 40
43/* 41cpumask_t xen_cpu_initialized_map;
44 * Structure and data for smp_call_function(). This is designed to minimise
45 * static memory requirements. It also looks cleaner.
46 */
47static DEFINE_SPINLOCK(call_lock);
48
49struct call_data_struct {
50 void (*func) (void *info);
51 void *info;
52 atomic_t started;
53 atomic_t finished;
54 int wait;
55};
56 42
57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 43static DEFINE_PER_CPU(int, resched_irq);
44static DEFINE_PER_CPU(int, callfunc_irq);
45static DEFINE_PER_CPU(int, callfuncsingle_irq);
46static DEFINE_PER_CPU(int, debug_irq) = -1;
58 47
59static struct call_data_struct *call_data; 48static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
49static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
60 50
61/* 51/*
62 * Reschedule call back. Nothing to do, 52 * Reschedule call back. Nothing to do,
@@ -65,6 +55,12 @@ static struct call_data_struct *call_data;
65 */ 55 */
66static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 56static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
67{ 57{
58#ifdef CONFIG_X86_32
59 __get_cpu_var(irq_stat).irq_resched_count++;
60#else
61 add_pda(irq_resched_count, 1);
62#endif
63
68 return IRQ_HANDLED; 64 return IRQ_HANDLED;
69} 65}
70 66
@@ -73,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
73 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
74 70
75 cpu_init(); 71 cpu_init();
72 preempt_disable();
73
76 xen_enable_sysenter(); 74 xen_enable_sysenter();
75 xen_enable_syscall();
77 76
78 preempt_disable(); 77 cpu = smp_processor_id();
79 per_cpu(cpu_state, cpu) = CPU_ONLINE; 78 smp_store_cpu_info(cpu);
79 cpu_data(cpu).x86_max_cores = 1;
80 set_cpu_sibling_map(cpu);
80 81
81 xen_setup_cpu_clockevents(); 82 xen_setup_cpu_clockevents();
82 83
84 cpu_set(cpu, cpu_online_map);
85 x86_write_percpu(cpu_state, CPU_ONLINE);
86 wmb();
87
83 /* We can take interrupts now: we're officially "up". */ 88 /* We can take interrupts now: we're officially "up". */
84 local_irq_enable(); 89 local_irq_enable();
85 90
@@ -122,6 +127,17 @@ static int xen_smp_intr_init(unsigned int cpu)
122 goto fail; 127 goto fail;
123 per_cpu(debug_irq, cpu) = rc; 128 per_cpu(debug_irq, cpu) = rc;
124 129
130 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
131 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
132 cpu,
133 xen_call_function_single_interrupt,
134 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
135 callfunc_name,
136 NULL);
137 if (rc < 0)
138 goto fail;
139 per_cpu(callfuncsingle_irq, cpu) = rc;
140
125 return 0; 141 return 0;
126 142
127 fail: 143 fail:
@@ -131,59 +147,45 @@ static int xen_smp_intr_init(unsigned int cpu)
131 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 147 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
132 if (per_cpu(debug_irq, cpu) >= 0) 148 if (per_cpu(debug_irq, cpu) >= 0)
133 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 149 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
150 if (per_cpu(callfuncsingle_irq, cpu) >= 0)
151 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
152
134 return rc; 153 return rc;
135} 154}
136 155
137void __init xen_fill_possible_map(void) 156static void __init xen_fill_possible_map(void)
138{ 157{
139 int i, rc; 158 int i, rc;
140 159
141 for (i = 0; i < NR_CPUS; i++) { 160 for (i = 0; i < NR_CPUS; i++) {
142 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 161 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
143 if (rc >= 0) 162 if (rc >= 0) {
163 num_processors++;
144 cpu_set(i, cpu_possible_map); 164 cpu_set(i, cpu_possible_map);
165 }
145 } 166 }
146} 167}
147 168
148void __init xen_smp_prepare_boot_cpu(void) 169static void __init xen_smp_prepare_boot_cpu(void)
149{ 170{
150 int cpu;
151
152 BUG_ON(smp_processor_id() != 0); 171 BUG_ON(smp_processor_id() != 0);
153 native_smp_prepare_boot_cpu(); 172 native_smp_prepare_boot_cpu();
154 173
155 /* We've switched to the "real" per-cpu gdt, so make sure the 174 /* We've switched to the "real" per-cpu gdt, so make sure the
156 old memory can be recycled */ 175 old memory can be recycled */
157 make_lowmem_page_readwrite(&per_cpu__gdt_page); 176 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
158
159 for_each_possible_cpu(cpu) {
160 cpus_clear(per_cpu(cpu_sibling_map, cpu));
161 /*
162 * cpu_core_map lives in a per cpu area that is cleared
163 * when the per cpu array is allocated.
164 *
165 * cpus_clear(per_cpu(cpu_core_map, cpu));
166 */
167 }
168 177
169 xen_setup_vcpu_info_placement(); 178 xen_setup_vcpu_info_placement();
170} 179}
171 180
172void __init xen_smp_prepare_cpus(unsigned int max_cpus) 181static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
173{ 182{
174 unsigned cpu; 183 unsigned cpu;
175 184
176 for_each_possible_cpu(cpu) { 185 xen_init_lock_cpu(0);
177 cpus_clear(per_cpu(cpu_sibling_map, cpu));
178 /*
179 * cpu_core_ map will be zeroed when the per
180 * cpu area is allocated.
181 *
182 * cpus_clear(per_cpu(cpu_core_map, cpu));
183 */
184 }
185 186
186 smp_store_cpu_info(0); 187 smp_store_cpu_info(0);
188 cpu_data(0).x86_max_cores = 1;
187 set_cpu_sibling_map(0); 189 set_cpu_sibling_map(0);
188 190
189 if (xen_smp_intr_init(0)) 191 if (xen_smp_intr_init(0))
@@ -218,7 +220,7 @@ static __cpuinit int
218cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 220cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 221{
220 struct vcpu_guest_context *ctxt; 222 struct vcpu_guest_context *ctxt;
221 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 223 struct desc_struct *gdt;
222 224
223 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 225 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
224 return 0; 226 return 0;
@@ -227,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
227 if (ctxt == NULL) 229 if (ctxt == NULL)
228 return -ENOMEM; 230 return -ENOMEM;
229 231
232 gdt = get_cpu_gdt_table(cpu);
233
230 ctxt->flags = VGCF_IN_KERNEL; 234 ctxt->flags = VGCF_IN_KERNEL;
231 ctxt->user_regs.ds = __USER_DS; 235 ctxt->user_regs.ds = __USER_DS;
232 ctxt->user_regs.es = __USER_DS; 236 ctxt->user_regs.es = __USER_DS;
233 ctxt->user_regs.fs = __KERNEL_PERCPU;
234 ctxt->user_regs.gs = 0;
235 ctxt->user_regs.ss = __KERNEL_DS; 237 ctxt->user_regs.ss = __KERNEL_DS;
238#ifdef CONFIG_X86_32
239 ctxt->user_regs.fs = __KERNEL_PERCPU;
240#endif
236 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
237 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
238 243
@@ -242,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
242 247
243 ctxt->ldt_ents = 0; 248 ctxt->ldt_ents = 0;
244 249
245 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 250 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
246 make_lowmem_page_readonly(gdt->gdt); 251 make_lowmem_page_readonly(gdt);
247 252
248 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 253 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
249 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 254 ctxt->gdt_ents = GDT_ENTRIES;
250 255
251 ctxt->user_regs.cs = __KERNEL_CS; 256 ctxt->user_regs.cs = __KERNEL_CS;
252 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 257 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -254,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
254 ctxt->kernel_ss = __KERNEL_DS; 259 ctxt->kernel_ss = __KERNEL_DS;
255 ctxt->kernel_sp = idle->thread.sp0; 260 ctxt->kernel_sp = idle->thread.sp0;
256 261
262#ifdef CONFIG_X86_32
257 ctxt->event_callback_cs = __KERNEL_CS; 263 ctxt->event_callback_cs = __KERNEL_CS;
258 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
259 ctxt->failsafe_callback_cs = __KERNEL_CS; 264 ctxt->failsafe_callback_cs = __KERNEL_CS;
265#endif
266 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
260 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
261 268
262 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -269,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
269 return 0; 276 return 0;
270} 277}
271 278
272int __cpuinit xen_cpu_up(unsigned int cpu) 279static int __cpuinit xen_cpu_up(unsigned int cpu)
273{ 280{
274 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
275 int rc; 282 int rc;
@@ -280,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
280 return rc; 287 return rc;
281#endif 288#endif
282 289
290#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0);
293 if (cpu > 0) {
294 rc = get_local_pda(cpu);
295 if (rc)
296 return rc;
297 }
298#endif
299
300#ifdef CONFIG_X86_32
283 init_gdt(cpu); 301 init_gdt(cpu);
284 per_cpu(current_task, cpu) = idle; 302 per_cpu(current_task, cpu) = idle;
285 irq_ctx_init(cpu); 303 irq_ctx_init(cpu);
304#else
305 cpu_pda(cpu)->pcurrent = idle;
306 clear_tsk_thread_flag(idle, TIF_FORK);
307#endif
286 xen_setup_timer(cpu); 308 xen_setup_timer(cpu);
309 xen_init_lock_cpu(cpu);
310
311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
287 312
288 /* make sure interrupts start blocked */ 313 /* make sure interrupts start blocked */
289 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 314 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -299,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
299 if (rc) 324 if (rc)
300 return rc; 325 return rc;
301 326
302 smp_store_cpu_info(cpu);
303 set_cpu_sibling_map(cpu);
304 /* This must be done before setting cpu_online_map */
305 wmb();
306
307 cpu_set(cpu, cpu_online_map);
308
309 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 327 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
310 BUG_ON(rc); 328 BUG_ON(rc);
311 329
330 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
331 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
332 barrier();
333 }
334
312 return 0; 335 return 0;
313} 336}
314 337
315void xen_smp_cpus_done(unsigned int max_cpus) 338static void xen_smp_cpus_done(unsigned int max_cpus)
316{ 339{
317} 340}
318 341
@@ -328,104 +351,254 @@ static void stop_self(void *v)
328 BUG(); 351 BUG();
329} 352}
330 353
331void xen_smp_send_stop(void) 354static void xen_smp_send_stop(void)
332{ 355{
333 smp_call_function(stop_self, NULL, 0, 0); 356 smp_call_function(stop_self, NULL, 0);
334} 357}
335 358
336void xen_smp_send_reschedule(int cpu) 359static void xen_smp_send_reschedule(int cpu)
337{ 360{
338 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 361 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
339} 362}
340 363
341
342static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 364static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
343{ 365{
344 unsigned cpu; 366 unsigned cpu;
345 367
346 cpus_and(mask, mask, cpu_online_map); 368 cpus_and(mask, mask, cpu_online_map);
347 369
348 for_each_cpu_mask(cpu, mask) 370 for_each_cpu_mask_nr(cpu, mask)
349 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
350} 372}
351 373
374static void xen_smp_send_call_function_ipi(cpumask_t mask)
375{
376 int cpu;
377
378 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
379
380 /* Make sure other vcpus get a chance to run if they need to. */
381 for_each_cpu_mask_nr(cpu, mask) {
382 if (xen_vcpu_stolen(cpu)) {
383 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
384 break;
385 }
386 }
387}
388
389static void xen_smp_send_call_function_single_ipi(int cpu)
390{
391 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
392}
393
352static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 394static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
353{ 395{
354 void (*func) (void *info) = call_data->func;
355 void *info = call_data->info;
356 int wait = call_data->wait;
357
358 /*
359 * Notify initiating CPU that I've grabbed the data and am
360 * about to execute the function
361 */
362 mb();
363 atomic_inc(&call_data->started);
364 /*
365 * At this point the info structure may be out of scope unless wait==1
366 */
367 irq_enter(); 396 irq_enter();
368 (*func)(info); 397 generic_smp_call_function_interrupt();
398#ifdef CONFIG_X86_32
369 __get_cpu_var(irq_stat).irq_call_count++; 399 __get_cpu_var(irq_stat).irq_call_count++;
400#else
401 add_pda(irq_call_count, 1);
402#endif
370 irq_exit(); 403 irq_exit();
371 404
372 if (wait) { 405 return IRQ_HANDLED;
373 mb(); /* commit everything before setting finished */ 406}
374 atomic_inc(&call_data->finished); 407
375 } 408static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
409{
410 irq_enter();
411 generic_smp_call_function_single_interrupt();
412#ifdef CONFIG_X86_32
413 __get_cpu_var(irq_stat).irq_call_count++;
414#else
415 add_pda(irq_call_count, 1);
416#endif
417 irq_exit();
376 418
377 return IRQ_HANDLED; 419 return IRQ_HANDLED;
378} 420}
379 421
380int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 422struct xen_spinlock {
381 void *info, int wait) 423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
382{ 428{
383 struct call_data_struct data; 429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
384 int cpus, cpu;
385 bool yield;
386 430
387 /* Holding any lock stops cpus from going down. */ 431 return xl->lock != 0;
388 spin_lock(&call_lock); 432}
389 433
390 cpu_clear(smp_processor_id(), mask); 434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
391 437
392 cpus = cpus_weight(mask); 438 /* Not strictly true; this is only the count of contended
393 if (!cpus) { 439 lock-takers entering the slow path. */
394 spin_unlock(&call_lock); 440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
395 return 0; 481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
396 } 539 }
540}
397 541
398 /* Can deadlock when called with interrupts disabled */ 542static void xen_spin_unlock(struct raw_spinlock *lock)
399 WARN_ON(irqs_disabled()); 543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
400 545
401 data.func = func; 546 smp_wmb(); /* make sure no writes get moved after unlock */
402 data.info = info; 547 xl->lock = 0; /* release lock */
403 atomic_set(&data.started, 0);
404 data.wait = wait;
405 if (wait)
406 atomic_set(&data.finished, 0);
407 548
408 call_data = &data; 549 /* make sure unlock happens before kick */
409 mb(); /* write everything before IPI */ 550 barrier();
410 551
411 /* Send a message to other CPUs and wait for them to respond */ 552 if (unlikely(xl->spinners))
412 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 553 xen_spin_unlock_slow(xl);
554}
413 555
414 /* Make sure other vcpus get a chance to run if they need to. */ 556static __cpuinit void xen_init_lock_cpu(int cpu)
415 yield = false; 557{
416 for_each_cpu_mask(cpu, mask) 558 int irq;
417 if (xen_vcpu_stolen(cpu)) 559 const char *name;
418 yield = true; 560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
419 573
420 if (yield) 574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
421 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 575}
422 576
423 /* Wait for response */ 577static void __init xen_init_spinlocks(void)
424 while (atomic_read(&data.started) != cpus || 578{
425 (wait && atomic_read(&data.finished) != cpus)) 579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
426 cpu_relax(); 580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
427 585
428 spin_unlock(&call_lock); 586static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done,
429 591
430 return 0; 592 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule,
594
595 .send_call_func_ipi = xen_smp_send_call_function_ipi,
596 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
597};
598
599void __init xen_smp_init(void)
600{
601 smp_ops = xen_smp_ops;
602 xen_fill_possible_map();
603 xen_init_spinlocks();
431} 604}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..2a234db5949b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,48 @@
1#include <linux/types.h>
2
3#include <xen/interface/xen.h>
4#include <xen/grant_table.h>
5#include <xen/events.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h>
9
10#include "xen-ops.h"
11#include "mmu.h"
12
13void xen_pre_suspend(void)
14{
15 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
16 xen_start_info->console.domU.mfn =
17 mfn_to_pfn(xen_start_info->console.domU.mfn);
18
19 BUG_ON(!irqs_disabled());
20
21 HYPERVISOR_shared_info = &xen_dummy_shared_info;
22 if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
23 __pte_ma(0), 0))
24 BUG();
25}
26
27void xen_post_suspend(int suspend_cancelled)
28{
29 xen_setup_shared_info();
30
31 if (suspend_cancelled) {
32 xen_start_info->store_mfn =
33 pfn_to_mfn(xen_start_info->store_mfn);
34 xen_start_info->console.domU.mfn =
35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else {
37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map;
39#endif
40 xen_vcpu_restore();
41 }
42
43}
44
45void xen_arch_resume(void)
46{
47 /* nothing */
48}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 41e217503c96..685b77470fc3 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void)
197} 197}
198 198
199 199
200/* Get the CPU speed from Xen */ 200/* Get the TSC speed from Xen */
201unsigned long xen_cpu_khz(void) 201unsigned long xen_tsc_khz(void)
202{ 202{
203 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
204 const struct pvclock_vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
@@ -459,6 +459,19 @@ void xen_setup_cpu_clockevents(void)
459 clockevents_register_device(&__get_cpu_var(xen_clock_events)); 459 clockevents_register_device(&__get_cpu_var(xen_clock_events));
460} 460}
461 461
462void xen_timer_resume(void)
463{
464 int cpu;
465
466 if (xen_clockevent != &xen_vcpuop_clockevent)
467 return;
468
469 for_each_online_cpu(cpu) {
470 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
471 BUG();
472 }
473}
474
462__init void xen_time_init(void) 475__init void xen_time_init(void)
463{ 476{
464 int cpu = smp_processor_id(); 477 int cpu = smp_processor_id();
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..7f58304fafb3
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 0
30#include <asm/percpu.h>
31
32/*
33 Enable events. This clears the event mask and tests the pending
34 event status with one and operation. If there are pending
35 events, then enter the hypervisor to get them handled.
36 */
37ENTRY(xen_irq_enable_direct)
38 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40
41 /* Preempt here doesn't matter because that will deal with
42 any pending interrupts. The pending check may end up being
43 run on the wrong CPU, but that doesn't hurt. */
44
45 /* Test for pending */
46 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
47 jz 1f
48
492: call check_events
501:
51ENDPATCH(xen_irq_enable_direct)
52 ret
53 ENDPROC(xen_irq_enable_direct)
54 RELOC(xen_irq_enable_direct, 2b+1)
55
56/*
57 Disabling events is simply a matter of making the event mask
58 non-zero.
59 */
60ENTRY(xen_irq_disable_direct)
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct)
63 ret
64 ENDPROC(xen_irq_disable_direct)
65 RELOC(xen_irq_disable_direct, 0)
66
67/*
68 (xen_)save_fl is used to get the current interrupt enable status.
69 Callers expect the status to be in X86_EFLAGS_IF, and other bits
70 may be set in the return value. We take advantage of this by
71 making sure that X86_EFLAGS_IF has the right value (and other bits
72 in that byte are 0), but other bits in the return value are
73 undefined. We need to toggle the state of the bit, because
74 Xen and x86 use opposite senses (mask vs enable).
75 */
76ENTRY(xen_save_fl_direct)
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah
79 addb %ah,%ah
80ENDPATCH(xen_save_fl_direct)
81 ret
82 ENDPROC(xen_save_fl_direct)
83 RELOC(xen_save_fl_direct, 0)
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109
110
111/*
112 Force an event check by making a hypercall,
113 but preserve regs before making the call.
114 */
115check_events:
116 push %rax
117 push %rcx
118 push %rdx
119 push %rsi
120 push %rdi
121 push %r8
122 push %r9
123 push %r10
124 push %r11
125 call force_evtchn_callback
126 pop %r11
127 pop %r10
128 pop %r9
129 pop %r8
130 pop %rdi
131 pop %rsi
132 pop %rdx
133 pop %rcx
134 pop %rax
135 ret
136#endif
137
138ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx
140 mov 8+8(%rsp),%r11
141 ret $16
142
143hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
144/*
145 Xen64 iret frame:
146
147 ss
148 rsp
149 rflags
150 cs
151 rip <-- standard iret frame
152
153 flags
154
155 rcx }
156 r11 }<-- pushed by hypercall page
157rsp -> rax }
158 */
159ENTRY(xen_iret)
160 pushq $0
1611: jmp hypercall_iret
162ENDPATCH(xen_iret)
163RELOC(xen_iret, 1b+1)
164
165/*
166 sysexit is not used for 64-bit processes, so it's
167 only ever used to return to 32-bit compat userspace.
168 */
169ENTRY(xen_sysexit)
170 pushq $__USER32_DS
171 pushq %rcx
172 pushq $X86_EFLAGS_IF
173 pushq $__USER32_CS
174 pushq %rdx
175
176 pushq $0
1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1)
180
181ENTRY(xen_sysret64)
182 /* We're already on the usermode stack at this point, but still
183 with the kernel gs, so we can easily switch back */
184 movq %rsp, %gs:pda_oldrsp
185 movq %gs:pda_kernelstack,%rsp
186
187 pushq $__USER_DS
188 pushq %gs:pda_oldrsp
189 pushq %r11
190 pushq $__USER_CS
191 pushq %rcx
192
193 pushq $VGCF_in_syscall
1941: jmp hypercall_iret
195ENDPATCH(xen_sysret64)
196RELOC(xen_sysret64, 1b+1)
197
198ENTRY(xen_sysret32)
199 /* We're already on the usermode stack at this point, but still
200 with the kernel gs, so we can easily switch back */
201 movq %rsp, %gs:pda_oldrsp
202 movq %gs:pda_kernelstack, %rsp
203
204 pushq $__USER32_DS
205 pushq %gs:pda_oldrsp
206 pushq %r11
207 pushq $__USER32_CS
208 pushq %rcx
209
210 pushq $VGCF_in_syscall
2111: jmp hypercall_iret
212ENDPATCH(xen_sysret32)
213RELOC(xen_sysret32, 1b+1)
214
215/*
216 Xen handles syscall callbacks much like ordinary exceptions,
217 which means we have:
218 - kernel gs
219 - kernel rsp
220 - an iret-like stack frame on the stack (including rcx and r11):
221 ss
222 rsp
223 rflags
224 cs
225 rip
226 r11
227 rsp-> rcx
228
229 In all the entrypoints, we undo all that to make it look
230 like a CPU-generated syscall/sysenter and jump to the normal
231 entrypoint.
232 */
233
234.macro undo_xen_syscall
235 mov 0*8(%rsp),%rcx
236 mov 1*8(%rsp),%r11
237 mov 5*8(%rsp),%rsp
238.endm
239
240/* Normal 64-bit system call target */
241ENTRY(xen_syscall_target)
242 undo_xen_syscall
243 jmp system_call_after_swapgs
244ENDPROC(xen_syscall_target)
245
246#ifdef CONFIG_IA32_EMULATION
247
248/* 32-bit compat syscall target */
249ENTRY(xen_syscall32_target)
250 undo_xen_syscall
251 jmp ia32_cstar_target
252ENDPROC(xen_syscall32_target)
253
254/* 32-bit compat sysenter target */
255ENTRY(xen_sysenter_target)
256 undo_xen_syscall
257 jmp ia32_sysenter_target
258ENDPROC(xen_sysenter_target)
259
260#else /* !CONFIG_IA32_EMULATION */
261
262ENTRY(xen_syscall32_target)
263ENTRY(xen_sysenter_target)
264 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
265 mov $-ENOSYS, %rax
266 pushq $VGCF_in_syscall
267 jmp hypercall_iret
268ENDPROC(xen_syscall32_target)
269ENDPROC(xen_sysenter_target)
270
271#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6ec3b4f7719b..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,14 +5,24 @@
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h> 7#include <linux/init.h>
8
8#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h>
11#include <asm/page.h>
12
9#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h>
10 15
11 __INIT 16 __INIT
12ENTRY(startup_xen) 17ENTRY(startup_xen)
13 movl %esi,xen_start_info
14 cld 18 cld
15 movl $(init_thread_union+THREAD_SIZE),%esp 19#ifdef CONFIG_X86_32
20 mov %esi,xen_start_info
21 mov $init_thread_union+THREAD_SIZE,%esp
22#else
23 mov %rsi,xen_start_info
24 mov $init_thread_union+THREAD_SIZE,%rsp
25#endif
16 jmp xen_start_kernel 26 jmp xen_start_kernel
17 27
18 __FINIT 28 __FINIT
@@ -20,17 +30,26 @@ ENTRY(startup_xen)
20.pushsection .text 30.pushsection .text
21 .align PAGE_SIZE_asm 31 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 32ENTRY(hypercall_page)
23 .skip 0x1000 33 .skip PAGE_SIZE_asm
24.popsection 34.popsection
25 35
26 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") 36 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
27 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") 37 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
28 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") 38 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
29 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) 39#ifdef CONFIG_X86_32
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 40 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 41#else
42 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
43#endif
44 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
45 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 46 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 47 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 48 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
49 ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
50 .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
51 ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
52 ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
53 ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
35 54
36#endif /*CONFIG_XEN */ 55#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..dd3c23152a2e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -9,22 +9,31 @@
9extern const char xen_hypervisor_callback[]; 9extern const char xen_hypervisor_callback[];
10extern const char xen_failsafe_callback[]; 10extern const char xen_failsafe_callback[];
11 11
12struct trap_info;
12void xen_copy_trap_info(struct trap_info *traps); 13void xen_copy_trap_info(struct trap_info *traps);
13 14
14DECLARE_PER_CPU(unsigned long, xen_cr3); 15DECLARE_PER_CPU(unsigned long, xen_cr3);
15DECLARE_PER_CPU(unsigned long, xen_current_cr3); 16DECLARE_PER_CPU(unsigned long, xen_current_cr3);
16 17
17extern struct start_info *xen_start_info; 18extern struct start_info *xen_start_info;
19extern struct shared_info xen_dummy_shared_info;
18extern struct shared_info *HYPERVISOR_shared_info; 20extern struct shared_info *HYPERVISOR_shared_info;
19 21
22void xen_setup_mfn_list_list(void);
23void xen_setup_shared_info(void);
24
20char * __init xen_memory_setup(void); 25char * __init xen_memory_setup(void);
21void __init xen_arch_setup(void); 26void __init xen_arch_setup(void);
22void __init xen_init_IRQ(void); 27void __init xen_init_IRQ(void);
23void xen_enable_sysenter(void); 28void xen_enable_sysenter(void);
29void xen_enable_syscall(void);
30void xen_vcpu_restore(void);
31
32void __init xen_build_dynamic_phys_to_machine(void);
24 33
25void xen_setup_timer(int cpu); 34void xen_setup_timer(int cpu);
26void xen_setup_cpu_clockevents(void); 35void xen_setup_cpu_clockevents(void);
27unsigned long xen_cpu_khz(void); 36unsigned long xen_tsc_khz(void);
28void __init xen_time_init(void); 37void __init xen_time_init(void);
29unsigned long xen_get_wallclock(void); 38unsigned long xen_get_wallclock(void);
30int xen_set_wallclock(unsigned long time); 39int xen_set_wallclock(unsigned long time);
@@ -36,23 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
36 45
37void xen_mark_init_mm_pinned(void); 46void xen_mark_init_mm_pinned(void);
38 47
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void); 48void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46 49
47void xen_smp_send_stop(void); 50#ifdef CONFIG_SMP
48void xen_smp_send_reschedule(int cpu); 51void xen_smp_init(void);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53 52
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), 53extern cpumask_t xen_cpu_initialized_map;
55 void *info, int wait); 54#else
55static inline void xen_smp_init(void) {}
56#endif
56 57
57 58
58/* Declare an asm function, along with symbols needed to make it 59/* Declare an asm function, along with symbols needed to make it
@@ -67,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void); 68DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long); 69DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69 70
71/* These are not functions, and cannot be called normally */
70void xen_iret(void); 72void xen_iret(void);
71void xen_sysexit(void); 73void xen_sysexit(void);
74void xen_sysret32(void);
75void xen_sysret64(void);
76void xen_adjust_exception_frame(void);
72 77
73#endif /* XEN_OPS_H */ 78#endif /* XEN_OPS_H */